aloy789 wrote:extracts data from a website
Most of the time this is against the terms of the website you are scraping.
aloy789 wrote:I'm really stuck on how to extract the information and would be grateful if somebody could help or give me some guidance.
Check out [man]cURL[/man]
Here is something I recently used (until the project was frozen):
// Placeholder credentials and target; $user/$pass are only needed by the
// disabled authentication options below.
$user = 'REMOVED';
$pass = 'REMOVED';
$url = 'https://REMOVED';

$curl = curl_init();

// Configure the whole transfer in one call. Options are applied in the
// order listed.
curl_setopt_array($curl, [
    // the address to fetch
    CURLOPT_URL => $url,
    // speak HTTP/1.1
    CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
    // have curl_exec() hand the response back instead of printing it
    CURLOPT_RETURNTRANSFER => true,
    // present a browser-like user agent
    CURLOPT_USERAGENT => 'Mozilla/4.0',
    // authentication scheme (disabled)
    //CURLOPT_HTTPAUTH => CURLAUTH_NTLM,
    // credentials as "user:password" (disabled)
    //CURLOPT_USERPWD => $user . ':' . $pass,
    // NOTE(review): peer certificate verification is switched off, so any
    // certificate is accepted -- only reasonable against a host you trust.
    CURLOPT_SSL_VERIFYPEER => false,
    // debugging hooks (disabled): route headers and body through the
    // read_header() / read_body() callbacks
    //CURLOPT_HEADERFUNCTION => 'read_header',
    //CURLOPT_WRITEFUNCTION => 'read_body',
]);
// define callback functions
/**
 * Debug callback for CURLOPT_HEADERFUNCTION: prints one response header line.
 *
 * Notes from <http://curl.haxx.se/libcurl/c/curl_easy_setopt.html>:
 * return the number of bytes actually written, or -1 to signal an error to
 * the library (it then aborts the transfer with a CURLE_WRITE_ERROR return
 * code). (Added in 7.7.2)
 *
 * @param mixed  $curl   the cURL handle supplied by the library (unused here)
 * @param string $string one raw header line
 * @return int byte length of $string
 */
function read_header($curl, $string)
{
    // The header text comes from the remote server, so escape it before
    // embedding it in HTML output; ENT_SUBSTITUTE keeps invalid byte
    // sequences from blanking the whole line.
    $safe = htmlspecialchars($string, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    echo "Header: $safe<br />\n";

    // Report the raw byte count (strlen is byte-wise), not the escaped length.
    return strlen($string);
}
/**
 * Debug callback for CURLOPT_WRITEFUNCTION: reports the size of each body chunk.
 *
 * Notes from <http://curl.haxx.se/libcurl/c/curl_easy_setopt.html>:
 * return the number of bytes actually taken care of. If that differs from
 * the amount passed in, the library treats it as an error, aborts the
 * transfer and returns CURLE_WRITE_ERROR.
 *
 * @param mixed  $curl   the cURL handle supplied by the library (unused here)
 * @param string $string one chunk of the response body
 * @return int byte length of $string
 */
function read_body($curl, $string)
{
    $byteCount = strlen($string);
    printf("Received %d bytes<br />\n", $byteCount);

    return $byteCount;
}
/** end debugging **/
// Run the transfer. curl_exec() returns false on failure; otherwise $res is
// what gets printed below.
$res = curl_exec($curl);

// Read the error text while the handle is still open, then release the
// handle on both the success and the failure path.
$err = curl_error($curl);
curl_close($curl);

if ($res === false) {
    // Abort the script with cURL's own description of what went wrong.
    die($err);
}

echo $res;
Edit:
After you've got the data, you will want to parse it with something like [man]split[/man] (removed in PHP 7; use [man]explode[/man] or [man]preg_split[/man] instead), or [man]preg_match[/man] and/or [man]preg_replace[/man].