The following is my PHP class object that will scrape a remote URL and time its evaluation and download. I am sorry if this is long but I want to make it as clear as possible.
Here are my steps to doing the remote scrape using CURL:
1) I scrape the remote URL with CURLOPT_HEADER set to 1 to obtain the HTTP headers
2) I glean off the headers the value of PHPSESSID to get the session ID of the remote URL
3) I set that to an object property $this->PHPSESSID
4) I then re-scrape the remote URL (with CURLOPT_HEADER set to 0) while setting a cookie string in a method into $qs, making $qs a semicolon-delimited string of cookie key=val pairs with spaces in between the semicolons.
5) I set up another CURL resource object and then re-scrape the site.
And it fails! The remote URL is displayed completely wrong because the required cookies (including the session id) is completely gone from the remote HTTP headers; CURL never does Set-cookie in spite of the cookies being set absolutely correctly.
Here is the code, I'm at my wit's end:
class Timer extends View {
/**
* @access private
* @var mixed $cookieName
*/
var $cookieName;
/**
* @access private
* @var mixed $PHPSESSID
*/
var $PHPSESSID;
/**
* @access private
* @var float $startTime
*/
var $startTime;
/**
* @access private
* @var mixed $url
*/
var $url;
/**
* Constructor. Set optional URL property. Set optional $cookieName property either through parameter or via $_REQUEST autoglobal with name of 'cookieName'
*
* @access public
* @param mixed $url (optional)
* @param mixed $cookieName (optional)
*/
function Timer($url = '', $cookieName = '') { // CONSTRUCTOR
$this->url = $url;
if ($cookieName) $this->cookieName = $cookieName;
if ($_REQUEST['cookieName']) $this->cookieName = $_REQUEST['cookieName'];
}
//-------------------------------------------- --* GETTER/SETTER METHODS *-- ------------------------------------------
/**
* Retrieve $PHPSESSID
*
* @access private
* @return mixed $PHPSESSID
*/
function &getRemoteSessionID() { // STATIC STRING METHOD
return $this->PHPSESSID;
}
/**
* Retrieve $url property
*
* @access private
* @return mixed $url
*/
function &getURL() { // STATIC STRING METHOD
return $this->url;
}
/**
* Set the cookie in $_COOKIE into the curl reference
*
* @access private
* @param resource $ch (reference) curl reference
*/
function &setCookieCurlSetOpt(&$ch) { // STATIC VOID METHOD
/*----------------------------------------------------------------------------------------------------------------------------
Remember that unlike $_GET or $_POST requests which use '&' to "glue" all key/val pairs together,
$_COOKIE requires a semicolon instead to use as its glue to adjoin all cookie key/val pairs together
-----------------------------------------------------------------------------------------------------------------------------*/
if ($this->cookieName) {
$qs = ';' . $this->cookieName . '=';
if (is_array($_COOKIE[$this->cookieName]) || is_object($_COOKIE[$this->cookieName])) {
$qs .= serialize($_COOKIE[$this->cookieName]);
} else {
$qs .= $_COOKIE[$this->cookieName];
}
} elseif (@sizeof(array_values($_COOKIE)) > 0) {
foreach ($_COOKIE as $key => $val) if (is_array($val) || is_object($val)) $qs .= "; $key=" . serialize($val); else $qs .= "; $key=$val";
}
$PHPSESSID = $this->getRemoteSessionID();
if ($qs && $PHPSESSID) $qs .= "; PHPSESSID=$PHPSESSID";
if ($qs) curl_setopt($ch, CURLOPT_COOKIE, trim(substr($qs, 1, strlen($qs))));
print_r("qs = " . trim(substr($qs, 1, strlen($qs))) . "<P>");
}
/**
* Set the $_POST curl set_opt options per instance of $_POST
*
* @access private
* @param resource $ch (reference) curl reference
*/
function &setPOSTCurlSetOpt(&$ch) { // STATIC VOID METHOD
if (@sizeof(array_values($_POST)) > 0) {
curl_setopt($ch, CURLOPT_POST, 1);
foreach ($_POST as $key => $val) $qs .= "&$key=" . urlencode(serialize($val));
curl_setopt($ch, CURLOPT_POSTFIELDS, substr($qs, 1, strlen($qs)));
}
}
/**
* Set $this->PHPSESSID with a "pop" connection to the remote site
*
* @access private
*/
function &setRemoteSessionID() { // STATIC STRING METHOD
if (is_object($this) && !$this->getURL() && $url) $this->setURL($url);
if (is_object($this)) $url = $this->getURL();
if ($url && ini_get('allow_url_fopen')) {
$url = $this->configureURL($url);
// grab URL and pass it to the browser
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_HEADER, 1);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_0);
$header .= curl_exec($ch);
curl_close($ch);
preg_match('/PHPSESSID=([a-zA-Z0-9]+);/i', $header, $matchArray);
$this->PHPSESSID = $matchArray[1];
}
}
/**
* Set the time
*
* @access private
* @param float $timeKeeper (reference)
* @return float $timeKeeper
*/
function &setTime(&$timeKeeper) { // STATIC FLOAT METHOD
$start = microtime();
$start = explode(' ', $start);
$start = (float)$start[1] + (float)$start[0];
$timeKeeper = $start;
return $timeKeeper;
}
/**
* Set $url property
*
* @access private
* @param mixed $url
*/
function &setURL($url) { // STATIC VOID METHOD
if (is_object($this) && !$this->url) $this->url = $url;
}
//--------------------------------------------- --* END OF GETTER/SETTER METHODS *-- -------------------------------
/**
* Configure URL
*
* @access private
* @param mixed $url (reference)
* @return mixed $url
*/
function &configureURL(&$url) { // STATIC STRING METHOD
if (preg_match('/\/[a-zA-Z0-9\-_]+$/i', $url)) $url .= '/';
return $url;
}
/**
* Display HTML based on given URL property value
*
* @access public
* @param mixed $url (optional)
* @return mixed HTML
*/
function &displayHTML($url = '') { // STATIC HTML STRING METHOD
global $projectFolderName, $username, $password;
if (is_object($this) && !$this->getURL() && $url) $this->setURL($url);
if (is_object($this)) $url = $this->getURL();
if ($url && ini_get('allow_url_fopen')) {
$url = $this->configureURL($url);
$this->setRemoteSessionID(); // GET REMOTE SESSION ID IF FOUND - REQUIRES RE-SCRAPING REMOTE SITE ONCE SESSION EXISTS
print_r("<P>this->PHPSESSID = "); print_r($this->getRemoteSessionID()); print_r("<P>");
// SET UP CURL RESOURCE OBJECT BASED UPON RECONFIGURED URL
$ch = curl_init();
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($ch, CURLOPT_HEADER, 1);
curl_setopt($ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_0);
$this->setCookieCurlSetOpt($ch);
curl_setopt($ch, CURLOPT_SSLVERSION, 3);
curl_setopt($ch, CURLOPT_URL, $url);
$this->setTime($this->startTime); // SET START TIME
$html .= curl_exec($ch);
$timer = $this->setTime($timer);
// CLOSE CURL RESOURCE OBJECT
curl_close($ch);
$html = preg_replace('/(.*<body[^>]*>)/i', '$1<p><b>' . ($timer - $this->startTime) . " seconds to run URL: \"$url\"</b>", $html);
}
return $html;
}
}
Phil