Hello all!
I have written some code that, given a web address, finds all the links on that page.
The problem is that the code is unable to find the nested links.
Example:
If I provide the site www.google.com, my code retrieves these files:
index.html
about.html
My question to you all is: the code is unable to find the links nested under index.html and about.html.
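To make the goal clearer, the behaviour I am after is roughly this (only a simplified sketch with a placeholder start URL, using my getLinks() function from the code below; this is not my actual code):

$queue   = array("http://www.google.com");
$visited = array();
while(!empty($queue))
{
    $page = array_shift($queue);          // take the next page off the queue
    if(in_array($page, $visited))
    {
        continue;                         // already crawled this one
    }
    $visited[] = $page;
    $found = getLinks($page, "http://www.google.com");
    foreach($found->url as $link)
    {
        if(!in_array($link, $visited))
        {
            $queue[] = $link;             // nested links should get queued as well
        }
    }
}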
Here is my code:
<?php
$domain = "http://www.google.com";
$links  = getLinks($domain, $domain);
if(!empty($links->url))
{
    $arr_urls1 = implode(",", $links->url);
    $durls1    = explode(",", $arr_urls1);
    $xurlsx    = array_unique($durls1);
    $aurls     = array();
    $i = 0;
    // copying non-duplicate entries to a new, re-indexed array to avoid offset errors
    foreach($xurlsx as $strItem)
    {
        $aurls[$i] = $strItem;
        $i++;
    }
    // Number of url links
    //echo "number of url links = " . count( $aurls ) . "<br><br>";

    // this loop is supposed to find the sub-links of the pages found so far, but I can't get it to work
    for($i = 0; $i < count($aurls); $i++)
    {
        $aurls_len = count($aurls);
        //echo $aurls[$i];
        //echo "<br>";
        flush();
        //echo "sending...".$aurls[$i]."<br>";
        getLinks($aurls[$i], $domain);
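        // the idea: merge any links found on this page into $aurls below so the outer loop visits them too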
        if(!empty($links->url))
        {
            $arr_urls = implode(",", $links->url);
            $xurls    = explode(",", $arr_urls); // to array
            //$xurls2=array_unique($temp_urls);
            /* FYI
            echo "<hr>";
            flush();
            for($j=0;$j<count($xurls);$j++)
            {
                echo $xurls[$j];
                echo "<br>";
                flush();
            }
            echo "<hr>";
            flush();
            */
            for($j = 0; $j < count($xurls); $j++)
            {
                if(in_array($xurls[$j], $aurls))
                {
                    //echo "found";
                    //flush();
                }
                else
                {
                    //echo "not found";
                    //flush();
                    // append the newly discovered URL so a later pass of the outer loop can crawl it
                    $aurls[$aurls_len] = $xurls[$j];
                    $aurls_len++;
                }
            }
            unset($xurls);
            unset($arr_urls);
        }
    }
echo "<hr>";
for($i=0;$i<count($aurls);$i++)
{
echo $aurls[$i];
echo "<br>";
}
echo "<hr>";
}
/*
for($i=0;$i<count($xurls);$i++)
{
echo $xurls[$i];
echo "<br>";
}
*/
//print_r(array_values($aurls));
//echo implode("<br>",$links0->javascript);
//curl_close($ch);
function getLinks($url, $host)
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_FAILONERROR, true);
    $source   = curl_exec($ch);
    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    echo "<br>Response: ".$httpCode." for ".$url."<br>";
    curl_close($ch);
    // capture the href value of every <a> tag; $result[1] holds the raw URLs
    preg_match_all("/<a (?:.*?)href=\"([^\"]+?)\"(?:[^>]*?)>/si", $source, $result);
    //$links->email = array();
    //$links->javascript = array();
    $links = new stdClass();
    $links->url = array();
    if($httpCode == 0 || $httpCode == 404 || $httpCode == 500 || $httpCode == 400)
    {
        $links->url[] = "";
        //break;
    }
    else
    {
        foreach($result[1] as $value)
        {
            if(strtolower(substr($value, 0, 7)) == "mailto:")
            {
                // $links->email[] = substr( $value, 7 );
            }
            else if(strtolower(substr($value, 0, 11)) == "javascript:")
            {
                // $links->javascript[] = substr( $value, 11 );
            }
            else
            {
                //index.html,www.xyz.com
                if(stripos($value, $host) === true)
                {
                    //$value=$host."/".$value;
                    $links->url[] = $value;
                }
                else
                {
                    if(stripos($value, "http://") === false)
                    {
                        if(substr($value, 0, 1) == "/")
                        {
                            //substr_replace($value," ",0);
                            $value = $host.$value;
                            $links->url[] = $value;
                        }
                        else
                        {
                            $value = $host."/".$value;
                            $links->url[] = $value;
                        }
                    }
                }
            }//else
            //else
        }//for
    }//else
    return $links;
    //flush();
}//func
?>
Please check the code. It does not produce any errors, but it fails to find any further links beyond the first page.