Sorry if this issue is too localized, but I am trying to update my scraping of the various bus lines in:
http://www.atm.it/it/Giromilano/Pagine/default.aspx by postingthe following curl requests that try to mimic its own way of transitioning between pages:
function scrape($line){
echo "scraping linea $line<br>";
$date= date("d//m//Y");
$hour= date("G:H");
$curl_connection=curl_init('http://www.atm.it/it/Giromilano/Pagine/default.aspx?lvid=137a6747-3533-43e4-967ab8b33a6b7d23-51adad95&wbt=nav&contextname=137a6747-3533-43e4-967ab8b33a6b7d23-51adad95&vp=348&lst=1&lsi=67');
curl_setopt($curl_connection, CURLOPT_CONNECTTIMEOUT, 30);
curl_setopt($curl_connection, CURLOPT_USERAGENT,
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)");
curl_setopt($curl_connection, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($curl_connection, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($curl_connection, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($curl_connection, CURLOPT_POSTFIELDS, array(
'__SPSCEditMenu'=>'true',
'MSOWebPartPage_PostbackSource'=>'',
'MSOTlPn_SelectedWpId'=>'',
'MSOTlPn_View'=>'0',
'MSOTlPn_ShowSettings'=>'False',
'MSOGallery_SelectedLibrary'=>'',
'MSOGallery_FilterString'=>'',
'MSOTlPn_Button'=>'none',
'__EVENTTARGET'=>'',
'__EVENTARGUMENT'=>'',
'__REQUESTDIGEST'=>'0x4093D9C485C5FE23A883267CAF6F1C875138278832090D75B83A9825F38D0B050AD84F5020F38DFC54F0693F1A7472F5C82349C1D1DA30929C56A69A2AD9F0B7,03 Jun 2013 13:06:00 -0000',
'MSOAuthoringConsole_FormContext'=>'',
'MSOAC_EditDuringWorkflow'=>'',
'MSOSPWebPartManager_DisplayModeName'=>'Browse',
'MSOWebPartPage_Shared'=>'',
'MSOLayout_LayoutChanges'=>'',
'MSOLayout_InDesignMode'=>'',
'MSOSPWebPartManager_OldDisplayModeName'=>'Browse',
'MSOSPWebPartManager_StartWebPartEditingName'=>'false',
'__LASTFOCUS'=>'',
'__VIEWSTATE'=>'...', //truncated to be posted here
'ctl00$SPWebPartManager1$g_bfee1d06_b90e_4116_9a45_603e7566dc51$txt_dp_lines'=>$line,
'ctl00$SPWebPartManager1$g_bfee1d06_b90e_4116_9a45_603e7566dc51$rd_ltype_ch'=>'0',
'ctl00$SPWebPartManager1$g_bfee1d06_b90e_4116_9a45_603e7566dc51$pg_v_pos'=>'0'));
//curl_setopt($curl_connection, CURLINFO_HEADER_OUT, true);
$contentType = 'text/xml';
$auth = '';
$header1 = 'Authentication-API-Key: 12345';
$charset= 'ISO-8859-1';
curl_setopt($curl_connection, CURLOPT_HTTPHEADER, Array('Content-type: '.$contentType.'; charset=' . $charset, $header1));
$output = curl_exec($curl_connection);
//echo "ottenendo $output<br>";
return $output;
}
Yet the result is extremely unreliable sometimes producing the right page, sometimes nothing and some other time a wrong page: you may test it at:
http://www.miafoto.it/iPhone/inarrivo/php/OO/testScrape.php
pasing the line apparently corresponding to number 67 in the lsi field of the get parameters.
Can someone give me some hint on how to proceeed?
Thanks,
Fabrizio Bartolomucci