Ok, looking for more feedback now. Changes made: a SITE_URL constant instead of a url variable, DOMDocument instead of preg_match_all, preg_grep instead of checking each value myself, SimpleXML to build the XML, and a curl class because I was having a problem getting a static handle to work inside the function. I've tested it, and the following works and gives me what I would expect:
<?php
/**
* Site map generator
*/
// Setup
$s = microtime(TRUE);
set_time_limit(0);
ob_start();
/** Site URL **/
define( 'SITE_URL', 'http://---.com/'); // obfuscated for public forum
/** Sitemap file with full path **/
$xmlfile = '/home/---/public_html/sitemap.xml'; // obfuscated for public forum
// URLs to put in the sitemap
$urls = array(SITE_URL);
// Fill the array
getUrls(SITE_URL,$urls);
$urls = array_unique($urls);
// xml starter
$xml = '<?xml version="1.0" encoding="UTF-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></urlset>';
$xml = new SimpleXMLElement($xml);
// Change frequency options, indexed by path depth (0 or 1 => weekly, 2+ => monthly)
$freqs = array('weekly','weekly','monthly');
foreach( $urls as $url ) {
// determine how often the current page "might" change
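// (SITE_URL itself has 3 slashes, so the depth index starts at 0 and each extra path segment adds 1)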
$freq = min(substr_count($url,'/') - 3, count($freqs) - 1);
// add current page to the sitemap
addURL($xml,$url,$freqs[$freq]);
}
// Attempt to put the new sitemap in the file
if( file_put_contents($xmlfile,$xml->asXML()) ) {
// New sitemap success!
echo 'Generated new sitemap at '.$xmlfile.' on '.date('Y-m-d')."\n";
} else {
// New sitemap failure :(
echo 'Failed to generate new site map on '.date('Y-m-d')."\n";
}
// Elapsed time for sitemap creation
$e = microtime(TRUE);
printf("Completed sitemap in %f\n",($e-$s));
// Turn the output into log contents
$out = ob_get_clean();
file_put_contents('sitemap.log',$out,FILE_APPEND);
// Adds a given url and frequency to the specified xml
function addURL(SimpleXMLElement $xml,$url,$freq) {
$t = $xml->addChild('url');
$t->addChild('loc',$url);
$t->addChild('changefreq',$freq);
}
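// Recursively crawls $url and appends every internal link it finds to $urls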
function getUrls($url,&$urls) {
static $myCurl;
if( !isset($myCurl) ) $myCurl = new myCurl();
// Get the page contents from the supplied $url
$page = $myCurl->getContents($url);
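// pause between requests so the crawl doesn't hammer the server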
sleep(5);
// Load document (suppress warnings from malformed real-world HTML)
libxml_use_internal_errors(true);
$doc = new DOMDocument();
$doc->loadHTML($page);
libxml_clear_errors();
// get all anchor tags
$anchors = $doc->getElementsByTagName('a');
$hrefs = array();
// loop thru all the anchor tags
foreach( $anchors as $anchor ) {
$hrefs[] = $anchor->getAttribute('href');
}
// Remove external links
$hrefs = preg_grep('/^'.preg_quote(SITE_URL,'/').'/',$hrefs);
// Remove links to resources
$hrefs = preg_grep('/\.(css|js|jpg|jpeg|bmp|gif|zip|txt)$/i',$hrefs,PREG_GREP_INVERT);
// Remove links already in our list
$hrefs = array_diff($hrefs,$urls);
foreach( $hrefs as $href ) {
// add link to the result array
$urls[] = $href;
// Page hasn't been parsed, get the links on that page (Recurse this function)
getUrls($href,$urls);
}
}
// Use curl to get the contents of a $url
class myCurl {
private $ch;
public function __construct() {
// Create curl handle
$this->ch = curl_init();
// Set curl options
curl_setopt($this->ch, CURLOPT_HEADER, 0);
curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($this->ch, CURLOPT_SSL_VERIFYHOST, false);
curl_setopt($this->ch, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($this->ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:9.0.1) Gecko/20100101 Firefox/9.0.1');
}
public function getContents($url) {
// set option to the url
curl_setopt($this->ch, CURLOPT_URL,$url);
// returns the contents
return curl_exec($this->ch);
}
public function __destruct() {
// close the handle
curl_close($this->ch);
}
}
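For reference, the generated sitemap ends up looking roughly like this (domain obfuscated the same way, page path made up for the example, and indented here for readability; asXML() actually writes the whole urlset on one line):
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>http://---.com/</loc>
    <changefreq>weekly</changefreq>
  </url>
  <url>
    <loc>http://---.com/example-page</loc>
    <changefreq>weekly</changefreq>
  </url>
</urlset>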
Thanks again for all your help.