OK, I hope somebody might still be listening here. I've cooked up a pretty elaborate Apache conf file, which I call seo.conf, that deals with only a portion of my site. It works with a PHP map script to turn boring ids into lovely SEO-friendly text phrases.
It's doing a fine job rewriting this:
http://192.168.1.2:8888/education/db/ug/ug_3.php?id=166027
to this:
http://192.168.1.2:8888/schools/undergraduate_colleges/Harvard-University/academics/166027/0
However, I'm feeling quite shaky about this whole thing. Some questions:
1) My rewrite directives result in EVERY SINGLE REQUEST ON MY SITE going through NUMEROUS rewrite directives. E.g., a single request for /index.php results in 9 RewriteCond directives getting processed. If I continue in this vein, I can reasonably expect this number to rise to 50 or more. Surely this will hurt the performance of my site dramatically? What is a reasonable number of RewriteCond directives to be processed for a single page request?
2) The last directive in my seo.conf file (see below) is there to remove a bit of query string that relates to a session id. Basically my site (some legacy code) will append a session id to nearly every url in an effort to propagate the sid should cookies be turned off. My redirect map program, in trying to honor and preserve that functionality, will append the sid after the last slash in the SEO-friendly url. If cookies are on, this results in sid=0 which could wipe a user's session so I must strip that off. If anyone sees a better way to handle this, I would appreciate knowing about it.
3) Can one send a 404 from the map program?
4) What's the story with RewriteLock? The documentation is rather tight-lipped about the need for it. Am I to understand that if I use a PHP script (or any other program) as a rewrite map, I need a RewriteLock file? What are the required permissions on this file? Can I assume that Apache needs to read/write it and will maintain anything it might contain?
5) Is PHP a poor choice for this application? If my map daemon crashes, won't part of my site go dark? Would I be better off writing this in C or something?
Any help would be MUCH appreciated.
Behold seo.conf:
# Turn on mod_rewrite and cap the number of internal redirects.
RewriteEngine on
RewriteOptions MaxRedirects=5
# Rewrite trace log. Level 9 is very verbose -- presumably meant for debugging only; confirm before production use.
RewriteLog /Applications/MAMP/htdocs/rewrite.log
RewriteLogLevel 9
# "seo" is an external-program map: lookups are answered by map.php.
RewriteMap seo prg:/Applications/MAMP/htdocs/map.php
# Map requests for the original files to the new SEO-friendly URLs.
# Applies only when the path starts with /education/db/ug/ug, the file exists
# on disk, and the query string is non-empty. The lookup key passed to the map
# is "<script>?<query string>"; the trailing "?" in the substitution drops the
# original query string from the 301 redirect target.
RewriteCond %{REQUEST_FILENAME} ^/education/db/ug/ug
RewriteCond %{DOCUMENT_ROOT}%{REQUEST_FILENAME} -f
RewriteCond %{QUERY_STRING} ^(.+)$
RewriteRule ^/education/db/ug/(ug.*)$ ${seo:$1?%1}? [L,R=301]
# The seven rules below translate an SEO-friendly URL of the form
#   /schools/undergraduate_colleges/<name>/<section>/<id>/<sid>
# back to the matching ug_N.php script; %1 is the id and %2 is the sid.
# None of them carries an [L] flag, so rule processing continues with the
# rules that follow.
# map general-info back to ug_1.php
RewriteCond %{REQUEST_FILENAME} ^/schools/undergraduate_colleges/[^/]+/general-info/([^/]+)/(.+)$
RewriteRule .+ /education/db/ug/ug_1.php?id=%1&sid=%2
# map campus-and-students back to ug_2.php
RewriteCond %{REQUEST_FILENAME} ^/schools/undergraduate_colleges/[^/]+/campus-and-students/([^/]+)/(.+)$
RewriteRule .+ /education/db/ug/ug_2.php?id=%1&sid=%2
# map academics back to ug_3.php
RewriteCond %{REQUEST_FILENAME} ^/schools/undergraduate_colleges/[^/]+/academics/([^/]+)/(.+)$
RewriteRule .+ /education/db/ug/ug_3.php?id=%1&sid=%2
# map cost-and-aid back to ug_4.php
RewriteCond %{REQUEST_FILENAME} ^/schools/undergraduate_colleges/[^/]+/cost-and-aid/([^/]+)/(.+)$
RewriteRule .+ /education/db/ug/ug_4.php?id=%1&sid=%2
# map admissions back to ug_5.php
RewriteCond %{REQUEST_FILENAME} ^/schools/undergraduate_colleges/[^/]+/admissions/([^/]+)/(.+)$
RewriteRule .+ /education/db/ug/ug_5.php?id=%1&sid=%2
# map articles back to ug_6.php
RewriteCond %{REQUEST_FILENAME} ^/schools/undergraduate_colleges/[^/]+/articles/([^/]+)/(.+)$
RewriteRule .+ /education/db/ug/ug_6.php?id=%1&sid=%2
# map community back to ug_7.php
RewriteCond %{REQUEST_FILENAME} ^/schools/undergraduate_colleges/[^/]+/community/([^/]+)/(.+)$
RewriteRule .+ /education/db/ug/ug_7.php?id=%1&sid=%2
# Remove an empty SID (sid=0) from the end of the query string.
# NOTE(review): the pattern needs at least one character before "&sid=0", so a
# query string consisting solely of "sid=0" is left untouched -- confirm that
# this is intended.
RewriteCond %{QUERY_STRING} ^(.+)&sid=0$
RewriteRule ^(.+)$ $1?%1
You will note that it makes reference to a RewriteMap script called map.php. Here's that script:
#!/Applications/MAMP/bin/php5/bin/php
<?php
// ---------------------------------------------------------------------------
// Start-up: runtime settings, log file, database credentials and connection.
// ---------------------------------------------------------------------------

// Report everything and remove the execution time limit: this script is meant
// to keep running and answer lookups for as long as its stdin stays open.
error_reporting(E_ALL);
set_time_limit(0); # forever program!

// The log file lives in the same directory as this script.
define('THIS_DIR', dirname(realpath(__FILE__)));
define('LOG_FILE', THIS_DIR . DIRECTORY_SEPARATOR . 'rewrite_log.txt');
// No FILE_APPEND here, so the log is started afresh on every launch.
file_put_contents(LOG_FILE, 'rewrite script starting ' . date('Y-m-d H:i:s') . "\n");

// database connect
#define('MYSQL_HOST', 'localhost');
define('MYSQL_HOST', ':/Applications/MAMP/tmp/mysql/mysql.sock');
define('MYSQL_DB', 'test');
define('MYSQL_USER', 'root');
define('MYSQL_PASSWORD', 'root');
define('SCHOOL_TABLE', 'live_schools');

// Connect once; the handle in $db is reused for every lookup.
// A single check covers failure so that the reason always reaches the log.
// Nothing is printed on failure: this script's stdout carries the answers to
// map lookups, so stray text there could be read as a rewrite result.
$db = mysql_connect(MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD);
if ($db === FALSE || !is_resource($db)) {
    write_log("Could not connect to the MySQL database");
    exit(1);
}
write_log("Connection to mysqldb successful");

if (!mysql_select_db(MYSQL_DB, $db)) {
    write_log("Could not select db for college rewrites");
    exit(1);
}
write_log("Selection of database successful");
// ---------------------------------------------------------------------------
// Main loop: read one lookup key per line from stdin and print exactly one
// reply line per key -- the SEO-friendly URL, or the 404 page when the key
// cannot be translated.
// ---------------------------------------------------------------------------

// Finds a "sid" parameter at the start of the line or right after "?", "&"
// or "/". The value stops at the next "&" so that any parameters following
// the sid are not swallowed into it.
$sid_pattern = '#(?:^|[?&/])sid=([^&]*)#i';
// Captures the script base name (e.g. "ug_3") and the numeric school id.
$pattern = '#(.+)\.php\?id=(\d+)($|&)#';
$keyboard = fopen("php://stdin","r");

// fgets() returns FALSE at end of input; stop then instead of looping forever
// on empty reads.
while (($raw = fgets($keyboard)) !== FALSE) {
    $line = trim($raw);
    write_log('line:' . $line);

    // extract $sid if any; a missing or empty sid is reported as 0 so the
    // generated URL always has a non-empty final path segment
    $sid = 0;
    $matches = NULL;
    if (preg_match($sid_pattern, $line, $matches) && $matches[1] !== '') {
        $sid = $matches[1];
    }

    $found = FALSE;
    // try to grab the id from it
    $matches = NULL;
    if (preg_match($pattern, $line, $matches)) {
        write_log(print_r($matches, true));
        $file = get_seo_friendly_file($matches[1]);
        write_log('returned ' . $file);
        $id = $matches[2];
        write_log('id=' . $id);
        if (($file !== '') && ($id !== '')) {
            // $id is digits only (enforced by $pattern), so concatenating it
            // into the statement cannot inject SQL.
            $sql = "SELECT name FROM " . SCHOOL_TABLE . " WHERE unitid=" . $id;
            // Warnings are suppressed here and the error is logged below;
            // a displayed warning could otherwise end up in the reply stream.
            $res = @mysql_query($sql, $db);
            if ($res) {
                if (mysql_num_rows($res) >= 1) {
                    if ($row = mysql_fetch_assoc($res)) {
                        # output the new url; every run of non-letters in the
                        # school name collapses to a single hyphen
                        $url = '/schools/undergraduate_colleges/' . preg_replace('#[^a-z]+#i', '-', $row['name']) . '/' . $file . '/' . $id . '/' . $sid;
                        $found = TRUE;
                        print $url . "\n";
                    } else {
                        write_log('college rewrite query failed to fetch a row:' . print_r($row, true));
                    }
                } else {
                    write_log('college rewrite rows returned is less than 1');
                }
                mysql_free_result($res);
            } else {
                write_log('college rewrite query returned no resource:' . mysql_error(). "\n" . $sql);
            }
        } else {
            write_log('file or college id blank');
        }
    } else {
        write_log('no match for id pattern in college rewrite');
    }

    if (!$found) {
        return_404();
    }

    // Push the reply out immediately; the reader on the other end of the
    // pipe is waiting for this line.
    flush();
}
write_log('stdin closed, rewrite script exiting');
/**
 * Append a single message, terminated by a newline, to the file named by
 * the LOG_FILE constant.
 *
 * @param string $msg Text to record.
 */
function write_log($msg) {
    $entry = "{$msg}\n";
    file_put_contents(LOG_FILE, $entry, FILE_APPEND);
}
/**
 * Emit the reply used when a lookup cannot be translated: the path of the
 * 404 page followed by a newline, written to standard output.
 */
function return_404() {
    echo "/404.php\n";
}
/**
 * Translate a legacy script base name into its SEO-friendly section slug.
 *
 * The call is recorded in the log before the lookup takes place.
 *
 * @param string $file Script base name without extension, e.g. "ug_3".
 * @return string The section slug, or an empty string when the name is unknown.
 */
function get_seo_friendly_file($file) {
    write_log('running seo friendly on ' . $file);

    $slugs = array(
        'ug_1' => 'general-info',
        'ug_2' => 'campus-and-students',
        'ug_3' => 'academics',
        'ug_4' => 'cost-and-aid',
        'ug_5' => 'admissions',
        'ug_6' => 'articles',
        'ug_7' => 'community',
    );

    // Entries are tried in order with a loose (==) comparison; the first hit wins.
    foreach ($slugs as $script => $slug) {
        if ($file == $script) {
            return $slug;
        }
    }

    return '';
}
?>