I have finally gotten my website spider to work the way I want, and now I would like to optimize it. The only way I know to make this script work 100% of the time is with the code below.
<?php
/**
 * Minimal same-site link spider.
 *
 * Fetches the page named by the `url` query parameter, counts how many of its
 * lines mention $yoursite, collects up to $PagesToCrawl distinct internal
 * links from it, then fetches each of those pages and prints the same
 * per-page line count.
 *
 * Usage: test.php?url=http://www.example.com
 */

/**
 * Returns the host part of $url without a leading "www.".
 *
 * @param string $url Absolute URL.
 * @return string Host name, or '' when $url has no parsable host.
 */
function spider_short_host($url)
{
    $host = parse_url($url, PHP_URL_HOST);
    if (!is_string($host) || $host === '') {
        return '';
    }

    return preg_replace('/^www\./i', '', $host);
}

/**
 * Reads $url line by line.
 *
 * @param string $url Absolute http(s) URL.
 * @return array Lines of the response, or an empty array when the fetch
 *               failed (file() reports the failure as a PHP warning).
 */
function spider_fetch_lines($url)
{
    $lines = file($url);

    return $lines === false ? [] : $lines;
}

/**
 * Counts the lines that contain $needle, case-insensitively.
 *
 * @param array  $lines  Lines to scan.
 * @param string $needle Text to look for.
 * @return int Number of matching lines (not number of occurrences).
 */
function spider_count_mentions(array $lines, $needle)
{
    $count = 0;
    foreach ($lines as $line) {
        // Search the raw line: htmlspecialchars() yields '' for lines with
        // invalid UTF-8, which would silently hide matches.
        if (stripos($line, $needle) !== false) {
            $count++;
        }
    }

    return $count;
}

/**
 * Decides whether $href should be crawled and strips its session id.
 *
 * @param string $href      Raw value of an href attribute.
 * @param string $shortHost Host of the start page (without "www.").
 * @param array  $blocked   File extensions / exact link values to skip.
 * @return string|null Cleaned link, or null when the link must be skipped.
 */
function spider_clean_link($href, $shortHost, array $blocked)
{
    // Empty links and same-page anchors never lead to a new page.
    if ($href === '' || $href[0] === '#') {
        return null;
    }

    // Internal means: mentions our host, or has no "http" at all (relative).
    $mentionsHost = $shortHost !== '' && stripos($href, $shortHost) !== false;
    $isRelative = stripos($href, 'http') === false;
    if (!$mentionsHost && !$isRelative) {
        return null;
    }

    if (stripos($href, 'javascript') !== false || stripos($href, 'mailto') !== false) {
        return null;
    }

    // Drop everything from the first "?PHPSESSID" / "&PHPSESSID" onwards.
    $page = preg_replace('/[?&]PHPSESSID.*$/s', '', $href);
    if ($page === '') {
        return null;
    }

    // The text after the last dot acts as the "file type" of the link.
    $segments = explode('.', $page);
    $type = end($segments);
    if (in_array($type, $blocked, true) || in_array($page, $blocked, true)) {
        return null;
    }

    return $page;
}

/**
 * Collects up to $limit distinct crawlable links from $lines.
 *
 * @param array  $lines     Lines of the start page.
 * @param string $shortHost Host of the start page (without "www.").
 * @param array  $blocked   File extensions / exact link values to skip.
 * @param int    $limit     Maximum number of links to return.
 * @return array List of cleaned links in order of first appearance.
 */
function spider_collect_links(array $lines, $shortHost, array $blocked, $limit)
{
    $pages = [];
    foreach ($lines as $line) {
        if (count($pages) >= $limit) {
            break;
        }
        if (!preg_match_all('|\shref=[\'"]([^\'"]+)["\']|i', $line, $matches)) {
            continue;
        }
        // Only the capture group holds the URLs; $matches[0] is the full tag text.
        foreach ($matches[1] as $href) {
            $page = spider_clean_link($href, $shortHost, $blocked);
            if ($page === null || in_array($page, $pages, true)) {
                continue;
            }
            $pages[] = $page;
            if (count($pages) >= $limit) {
                break 2;
            }
        }
    }

    return $pages;
}

/**
 * Turns a link found on $baseUrl into an absolute URL.
 *
 * Handles absolute, protocol-relative, root-relative, query-only and
 * directory-relative links; it does not normalise "./" or "../" segments.
 *
 * @param string $baseUrl Absolute http(s) URL of the page holding the link.
 * @param string $link    Link as returned by spider_collect_links().
 * @return string Absolute URL.
 */
function spider_resolve_url($baseUrl, $link)
{
    if (preg_match('#^https?://#i', $link)) {
        return $link;
    }

    $parts = parse_url($baseUrl);
    $origin = $parts['scheme'] . '://' . $parts['host']
        . (isset($parts['port']) ? ':' . $parts['port'] : '');
    $path = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';

    if (strpos($link, '//') === 0) {
        return $parts['scheme'] . ':' . $link;
    }
    if ($link[0] === '/') {
        return $origin . $link;
    }
    if ($link[0] === '?') {
        return $origin . $path . $link;
    }

    $slash = strrpos($path, '/');
    $dir = $slash === false ? '/' : substr($path, 0, $slash + 1);

    return $origin . $dir . $link;
}

$PagesToCrawl = 6;
$yoursite = "sparkletags.com";
$donotallow = [
    'css', 'ico', '#', '/', 'com', 'net', 'org', 'info', 'de', 'uk', 'xml',
    'com/', 'net/', 'org/', 'info/', 'de/', 'uk/', 'exe', 'zip', 'tar', 'gz',
    'txt', 'jpg', 'gif', 'png', 'bmp', 'mp3', 'wma', 'avi', 'wav', 'swf',
];

// Only plain http(s) URLs are accepted, so the request parameter can never
// make file() read local files or use other stream wrappers.
$url = (isset($_GET['url']) && is_string($_GET['url'])) ? trim($_GET['url']) : '';
$scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));
$shorturl = spider_short_host($url);
if (($scheme !== 'http' && $scheme !== 'https') || $shorturl === '') {
    echo "Usage: test.php?url=http://www.example.com<br>";
    return;
}

$lines = spider_fetch_lines($url);
$page = spider_collect_links($lines, $shorturl, $donotallow, $PagesToCrawl);

echo "Found " . spider_count_mentions($lines, $yoursite) . " on "
    . htmlspecialchars($shorturl, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . "/<br>";

foreach ($page as $new) {
    $link = spider_count_mentions(spider_fetch_lines(spider_resolve_url($url, $new)), $yoursite);
    // $new comes from a remote page, so escape it before echoing it back.
    echo "Found " . $link . " on "
        . htmlspecialchars($new, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . "<br>";
}
?>
Usage: test.php?url=http://www.gothics-r-us.com
or: test.php?url=http://www.revolutionmyspace.com
Thanks!