Been wacking at this for days and I cant get it to work correctly.
Can someone give it a shot for me please...
I spiders nicely, just one thing, after it spiders, it starts over again.
IT ends up putting doubles into the database because of it.
I need it to stop at some point after it gathers information from all crawled links.
I know it doesnt look pretty but hey..
ANy help would be nice, and yeah I know there are many spiders out there but I can learn more this way, with some helpe of course.
<?php
ini_set('max_execution_time',0);
set_time_limit( 0 );
#
require "./conf/db_connect.inc.php";
require_once "./conf/display.functions.inc.php";
include("tagretrieval.php"); //gets title
// Where should we start searching from?
$start = "http://somesite.com/";
// Build information about the site we're going to search.
if($url = parse_url($start))
{
if(isset($url['scheme']))
{
$b_scheme = $url['scheme'];
$b_url = $b_scheme."://";
}
if(isset($url['host']))
{
$b_host = $url['host'];
$b_url = $b_url.$b_host;
}
if(isset($url['path']))
{
$b_path = dirname($url['path']);
$b_url = $b_url.$b_path;
}
}
else
{
echo("\nError!\n");
echo("Description: Unable to parse starting URL. ");
echo("Please enter a different URL to start from.\n");
echo("Starting URL: " .$start. "\n\n");
exit;
}
// Initialize our array of links.
$links = array($start => "0");
// Keep crawling until we run out of links.
while($p_link = array_search("0", $links))
{
// Mark this link as having been seen.
$links[$p_link] = "1";
// Get the contents of the link we're currently looking at.
// If we fail this, there's no point in going further.
// We're going to surpress PHP's warning messages here as well.
if(@ $contents = file_get_contents($p_link))
{
$query1 = "SELECT * FROM keyword2 where url='$p_link'";
$result1= mysql_query($query1) or die( "ERROR: " . mysql_error() . "n");
$num1 = mysql_num_rows($result1);
$meta = get_meta_tags($p_link);
$description = $meta[description];
$keyword = $meta[keywords];
$keyword = str_replace(".", ",", $keyword);
$kwords2 = explode(",", $keyword);
$description = str_replace("'", "", $description);
srand((double)microtime() * 10000000);
$originalArray = array("$kwords2[0]", "$kwords2[1]", "$kwords2[2]", "$kwords2[3]",
"$kwords2[4]", "$kwords2[5]", "$kwords2[6]", "$kwords2[7]", "$kwords2[8]",
"$kwords2[9]", "$kwords2[10]", "$kwords2[11]", "$kwords2[12]", "$kwords2[13]",
"$kwords2[15]", "$kwords2[16]", "$kwords2[17]", "$kwords2[18]", "$kwords2[19]", "$kwords2[20]");
$pickOne = array_rand($originalArray, 1);
$aRandomSelection = $originalArray[$pickOne ];
if($num1<=0){//if link isnt found in db, then we will insert
$file = file_get_contents($p_link);
$title = get_doc_title($file);
if(!empty($title[0])){
$title = strip_tags(trim($title[0]));
$title = str_replace("'","", $title);
$title = str_replace("-","", $title);
$title = str_replace("*","", $title);
$title = trim($title);
}else{
$title = strip_tags(trim($title[0]));
$title = str_replace("'","", $title);
$title = str_replace("-","", $title);
$title = str_replace("*","", $title);
$title = trim($title);
$title=$p_link;
}
if(($title!=="") && ($aRandomSelection!="") && ($description!="")){
$aRandomSelection2 = strtolower($aRandomSelection);
$title = strtolower($title);
$description = strtolower($description);
//$shortppctitle = substr($title, 0, 25) . "...";
if((preg_match("/^[a-zA-Z\.\,\_\-\'\ ]+$/u", $title)) && (preg_match("/^[a-zA-Z\.\,\_\-\'\ ]+$/u", $description)) && (preg_match("/^[a-zA-Z\.\,\_\-\'\ ]+$/u", $aRandomSelection2))) {
//here is where I insert into database the title, description and one random keyword
}
// What link are we following?
echo("Following link: " .$uniqid. "-<b>" .$p_link. "</b><BR>" .$title. "<BR>" .$description. "<BR><BR>");
}
//
}
// Build information about the link we're currently looking at.
if($url = parse_url($p_link))
{
$p_url = $p_link;
if(isset($url['scheme']))
{
$p_scheme = $url['scheme'];
$p_url = $p_scheme."://";
}
if(isset($url['host']))
{
$p_host = $url['host'];
$p_url = $p_url.$p_host;
}
if(isset($url['path']))
{
$p_path = dirname($url['path']);
$p_url = $p_url.$p_path;
}
}
// Extract the links from the current page.
preg_match_all("/href=\"(.*?)\"/", $contents, $link_results);
// Loop through our extracted links and manipulate them.
for($i = 0; $i < count($link_results[1]); $i++)
{
// Get an extracted link from out list and assume it's good.
$c_link = $link_results[1][$i];
$c_valid = true;
// Trim any whitespace that might be on our link.
$c_link = trim($c_link);
// Build information about our extracted link.
// If we can't parse the URL, don't continue.
// Surpress all PHP warnings here as well.
if(@ $url = parse_url($c_link))
{
if(isset($url['host']))
{
$c_host = $url['host'];
}
if(isset($url['query']))
{
$c_query = $url['query'];
}
if(isset($url['fragment']))
{
$c_fragment = $url['fragment'];
}
}
else
{
// If we won't be able to follow it, mark it as bad.
$c_valid = false;
}
if(preg_match("/\.(jpg|gif|png|ico)$/i", $c_link))
{
$c_valid = false;
}
elseif(preg_match("/\.(zip|rar|tar|gz)$/i", $c_link))
{
$c_valid = false;
}
elseif(preg_match("/\.(c|pl|py|js|reg|orig)$/i", $c_link))
{
$c_valid = false;
}
elseif(preg_match("/\.(exe|java|class)$/i", $c_link))
{
$c_valid = false;
}
elseif(preg_match("/\.(css|xml|txt|doc|pdf|lit)$/i", $c_link))
{
$c_valid = false;
}
elseif(preg_match("/\.(mp3|wav|ra|pm)$/i", $c_link))
{
$c_valid = false;
}
// If our link's made it this far, it's good, so let's keep it.
if($c_valid)
{
// Remove queries from the end of a link.
if(isset($c_query))
{
$c_link = preg_replace("/\?(.*?)$/", "", $c_link);
}
// Remove fragments from the end of a link.
if(isset($c_fragment))
{
$c_link = preg_replace("/#(.*?)$/", "", $c_link);
}
// Case 1: The URL is of the form: /directory/file
if(preg_match("/^\//", $c_link))
{
$c_link = $b_scheme."://".$b_host.$c_link;
}
// Case 2: The URL is of the form: ../directory/file
if(preg_match("/^\.\.\//", $c_link))
{
// How many directories will we have to backtrack into?
preg_match_all("/\.\.\//", $c_link, $count);
$count = count($count[0]);
// Remove the relative bits from our link.
$c_link = preg_replace("/\.\.\//", "", $c_link);
// Remove leading and trailing slashes from our path.
$p_path = preg_replace("/^\//", "", $p_path);
$p_path = preg_replace("/\/$/", "", $p_path);
// Backtrack the required number of directories.
$path_array = explode("/", $p_path);
$new_path = "";
for($j = $count; $j > 0; $j--)
{
array_pop($path_array);
}
for($j = 0; $j < count($path_array); $j++)
{
$new_path = $new_path.$path_array[$j]."/";
}
// Tack our new path onto the begining of our link.
$c_link = $p_scheme."://".$p_host."/".$new_path.$c_link;
}
// Case 3: The URL is of the form: ./directory/file
$c_link = preg_replace("/^\.\//", "", $c_link);
// Case 4: The URL is of the form: file
if(!preg_match("/^http:/", $c_link))
{
if(preg_match("/\/$/", $p_url))
{
$c_link = $p_url.$c_link;
}
else
{
$c_link = $p_url."/".$c_link;
}
}
// Remove any www. stuff from the start of our link.
$c_link = preg_replace("/^http:\/\/www\./", "http://", $c_link);
// Add our extracted list to our list of links to look at.
if(!array_key_exists($c_link, $links))
{
$links[$c_link] = "0";
}
}
}
}
}
?>