I have written a screen scrape of a Publishers Weekly top 25 list that basically works. But I want to link the book's cover art to my library's catalog, so I need to capture the author and title variables in the array and add them to the search url for the catalog.
The table that stores the data has the book's ranking in the first column, while the title and author are in divs in the second column, like so:
<table><tr><td>[ranking]</td><td><div>title</div><div>author</div></td></tr></table>
Here's my scrape's code. Do I want to combine the arrays somehow, or should I have done something differently with xpath? So I'm not sure if I should be focusing on the array piece, or xpath.
Thanks for your help.
spivey
<?php
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, 'http://www.publishersweekly.com/pw/bestsellers/hardcover-fiction.html');
curl_setopt($ch, CURLOPT_HEADER, 0); // no headers in output
curl_setopt($ch, CURLOPT_VERBOSE, 1); // verbose output, good for debugging
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);// $ch will return the results of your POST when you execute
curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)");
$result = curl_exec ($ch);
curl_close($ch);
$doc = new DOMDocument();
@$doc->loadHTML($result);
$xpath = new DOMXpath($doc);
$rows = $xpath->query("/html/body/div/div/div/div/table//tr");
echo '<table><tr><td style="color: red;position:absolute;top:5px;line-height:4.97em;">';
$ranks = $xpath->query("/html/body/div/div/div/div/table/tr/td[1]");
foreach ($ranks as $rank ) {
echo $rank->nodeValue . '<br />';
}
echo '</td><td style="position: absolute;top:85px;left:100px;line-height:4.97em;">';
$bk_titles = $xpath->query("/html/body/div/div/div/div/table/tr/td[2]/div[1]");
foreach ($bk_titles as $bk_title) {
echo $bk_title->nodeValue . '<br />';
}
echo '</td><td style="position: absolute;top:85px;left:450px;line-height:4.97em;">';
$bk_auths = $xpath->query("//table[1]/tr/td[2]/div[2]");
foreach ($bk_auths as $bk_auth) {
$target_string = $bk_auth->nodeValue;
$slashpos = strpos($target_string, '/');
if ( $slashpos !== false )
{
$string1 = substr($target_string, 0, $slashpos);
}
else
{
$string1 = $target_string;
}
$title = str_replace(',', '', $string1);
echo $title . '<br />';
}
echo '</td><td style="position: absolute;top:90px;left:650px;">';
$bk_imgs = $xpath->query("//table[1]/tr/td[2]/img/@src");
foreach ($bk_imgs as $bk_img) {
echo '<a href="http://catalog.examplelibrary.org/uhtbin/cgisirsi/x/0/0/5?srchfield1=AU^AUTHOR^AUTHORS^Author%20Processing^author&searchdata1=' . $bk_title->nodeValue . '&searchoper1=AND&srchfield2=SU^SUBJECT^SUBJECTS^^subject&searchdata2=' . $title . '"><img style="height:70px;margin-bottom:10px;" src="http://www.publishersweekly.com/' . $bk_img->nodeValue . '" alt="cover art" /></a><br />';
}
echo "</td></tr></table>";
?>