laserlight
The new project uses Codeigntier and, while Codeigniter is apparently not built to make it easy to use prepared statements, this is apparently feasible. I supose you are suggesting this because it would provide some security as far as exploits go, but I can't help but wonder if that's a foolproof way to just feed user input directly into a query?
I also wonder if using a prepared statement via PDO is any more secure than using PDO::quote, which is what CodeIgniter apparently does with its PDO driver. I worry about just cramming user input into a query and relying on the prepared statement behaviors to prevent SQL injection. Is it really to be trusted? Will it prevent an SQL injection attack that uses a DELETE char, for instance?
And simply using prepared statements doesn't really address the issue of generating suitable SQL. I don't have any special attachment to the SQL generation code in this project but it does function adequately and it works by splitting any supplied user search string into keywords and searching various columns in various tables for whole and partial word matches. It sorts the search results by allocating score to different types REGEXP matches. Some tables count more than others, a full-word match counts more than just a partial word match. This rather ad-hoc logic, which I've cleaned up below, was originally dictated by the client and seems to serve the users reasonably well.
The following PHP functions are static methods in a class:
public static function search($db, $search_query){
$clean_keywords = self::clean_keywords($search_query);
$keyword_count = sizeof($clean_keywords);
// these generate arrays of regex patterns around each keyword
$full_regex = self::key_regex($clean_keywords, TRUE);
$part_regex = self::key_regex($clean_keywords, FALSE);
// array to accumulate all the disparate queries
$sql = array();
// record titles
for ($j=0; $j<$keyword_count; $j++) {
// TABLE 1: get records with keyword in title
$sql[] = "SELECT c.id_code AS c_i, c.title AS c_t, c.seo_title, ROUND(IF(c.title REGEXP $full_regex[$j], 40, 20),1) AS score, 'q_ct_{$j}' AS qid
FROM " . TITLE_TABLE . " c
WHERE c.title REGEXP $full_regex[$j] OR c.title REGEXP $part_regex[$j]";
}
// task statements
for ($j=0; $j<$keyword_count; $j++) {
// get records from tasks by full or partial word match
// NOTE: this sql is carefully constructed to allow for multiple tasks per record, some full match, some part. we score higher if both exist, BUT DO NOT SUM
$sql[] = "SELECT c.id_code AS c_i, c.title AS c_t, c.seo_title, ROUND(IF(ts.task REGEXP $full_regex[$j], 25, 12.5),1) AS score, CONCAT('q_ts_{$j}_', IF(ts.task REGEXP $full_regex[$j], 'full', 'part')) AS qid
FROM " . TASK_TABLE . " ts, " . TITLE_TABLE . " c
WHERE (ts.task REGEXP $full_regex[$j] OR ts.task REGEXP $part_regex[$j]) AND
ts.id_code = c.id_code
GROUP BY c_i, qid";
}
// alternate titles
for ($j=0; $j<$keyword_count; $j++) {
// get records from alternate titles by full or partial word match
// NOTE: this sql is carefully constructed to allow for multiple alternate titles per record, some full match, some part. we score higher if both exist, BUT DO NOT SUM
$sql[] = "SELECT c.id_code AS c_i, c.title AS c_t, c.seo_title, ROUND(IF(oat.alternate_title REGEXP $full_regex[$j], 20, 10),1) AS score, CONCAT('q_at_{$j}_', IF(oat.alternate_title REGEXP $full_regex[$j], 'full', 'part')) AS qid
FROM " . ALTERNATE_TITLE_TABLE . " oat, " . TITLE_TABLE . " c
WHERE (oat.alternate_title REGEXP $full_regex[$j] OR oat.alternate_title REGEXP $part_regex[$j]) AND
oat.id_code = c.id_code
GROUP BY c_i, qid";
}
// other data
for ($j=0; $j<$keyword_count; $j++) {
// get records from other data by full or partial word;
// NOTE: this sql is carefully constructed to allow for multiple occupation data per record, some full match, some part. we score higher if both exist, BUT DO NOT SUM
$sql[] = "SELECT c.id_code AS c_i, c.title AS c_t, c.seo_title, ROUND(IF(od.description REGEXP $full_regex[$j], 15, 7.5),1) AS score, CONCAT('q_od_{$j}_', IF(od.description REGEXP $full_regex[$j], 'full', 'part')) AS qid
FROM " . OTHER_DATA_TABLE . " od, " . TITLE_TABLE . " c
WHERE (od.description REGEXP $full_regex[$j] OR od.description REGEXP $part_regex[$j]) AND
od.id_code = c.id_code
GROUP BY c_i, qid";
}
// after we've accumulated all the keyword-specific queries, we make a UNION query to combine them into a single one
$combined_sql = "SELECT c_i, c_t, seo_title, SUM(score) AS score
FROM (" . implode("\nUNION\n", $sql) . ") AS union_query
GROUP BY c_i";
$retval = self::db_fetch($db, $combined_sql);
return $retval;
} // search()
/**
* Runs the specified query and returns an array of arrays
* @param CI_DB $db
* @param string $sql
*/
private static function db_fetch($db, $sql) {
$query = $db->query($sql);
$retval = array();
while ($row = $query->unbuffered_row("array")) {
$row["score"] = floatval($row["score"]);
$retval[] = $row;
}
return $retval;
}
/**
* Function to generate MySQL REGEXP patterns from each keyword
* @param array $clean_keywords numerically indexed array of keywords for which we search
* @param boolean $full TRUE if you want full word match, FALSE if you just want to match start-of-word
* @throws Exception
*/
private static function key_regex($clean_keywords, $full) {
if (!is_array($clean_keywords)) {
throw new Exception("Invalid clean keywords. Should be an array");
}
$i = 0;
$retval = array();
// FIXME we need to db escape and regex escape these values if we want the search to be safe and work for all input
foreach ($clean_keywords as $key => $value) {
if ($full) {
$retval[$i] = '\'[[:<:]]' . $value . '[[:>:]]\'';
} else {
$retval[$i] = '\'[[:<:]]' . $value. '\'';
}
$i++;
}
return $retval;
}
private static function clean_keywords($search_query) {
// TODO: should we allow fancy search syntax? NOTE that we'll feed this into
// a MySQL REGEXP expression so we would need to escape and filter properly
// convert any newline chars or multiple spaces into single spaces
$pattern = "/\s+/";
$clean_string = trim(preg_replace($pattern, " ", $search_query));
// use unicode regex to whitelist valid chars, excludes any chars that aren't letters, numbers, or spaces
$pattern = "/[^\pL\pN\pZ]/u";
$clean_string = preg_replace($pattern, "", $clean_string);
// get an array of all the words, this breaks along whitespace and punctuation
// this function seems to happily avoid empty entries and leading/trailing spaces
preg_match_all("/([\w\.]+)/", $clean_string, $search_words);
// remove trivial words
// TODO consider adding additional words here and move this into a static var or class constant
$skip_words = array('and', 'or');
foreach ($search_words[0] as $k=>$v) {
if (in_array(mb_strtolower($v), $skip_words)) {
unset($search_words[0][$k]);
}
}
return $search_words[0];
} // clean_keywords()
To call this function, you do something like this:
$search_query = trim($this->input->post("q"));
if (mb_strlen($search_query) == 0) {
// FIXME show the user an alert
show_error("You did not enter anything for your search.", 400);
}
// FIXME also check maximum string length here
$search_results = MY_site_search::search($this->db, $search_query);