Turning this one-time script into a conditional loop
Results 1 to 4 of 4

Thread: Turning this one-time script into a conditional loop

  1. #1
    Member
    Join Date
    Oct 2016
    Posts
    41

    Turning this one-time script into a conditional loop

    Hi there everyone!

    I've got a script that I hope to modify to crawl all local links on a website. Right now, I've got it crawling the index and printing out the links found in it. What I need it to do is to keep crawling those links until there are no more to crawl.

    This initially seemed trivial to me, but I'm having a surprisingly hard time even figuring out how to start this modification. My rough idea is to keep one array of links that have already been crawled and another array of links that still need to be crawled, add newly found links to the latter on each pass, and stop looping once the counts of the two arrays match.

    Here's my start of building a system to loop:

    PHP Code:
    // Base URL of the site to crawl; site-relative paths are appended to it.
    $site_domain = 'https://wheeltastic.com';
    // Paths that have already been crawled; the index page is handled first.
    $crawled_array[] = '/';

    /**
     * Report whether the file at $path is a GIF, JPEG, PNG or BMP image.
     *
     * @param string $path Path handed to getimagesize().
     * @return bool True when getimagesize() identifies one of the four types, false otherwise.
     */
    function is_image($path){
        $a = getimagesize($path);

        // getimagesize() returns false when it cannot read or identify the file;
        // stop here instead of indexing into a non-array.
        if($a === false){
            return false;
        }

        // Index 2 of the result holds the IMAGETYPE_* constant.
        $image_type = $a[2];

        if(in_array($image_type , array(IMAGETYPE_GIF , IMAGETYPE_JPEG ,IMAGETYPE_PNG , IMAGETYPE_BMP), true)){
            return true;
        }
        return false;
    }

    // Identify the crawler to the server through a custom User-Agent header.
    $options  = array('http' => array('user_agent' => 'Wheelie-Bot / Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'));
    $context  = stream_context_create($options);

    // Fetch the site's index page.
    $html = file_get_contents($site_domain, false, $context);

    $dom = new DOMDocument();
    // @ silences the parser warnings loadHTML() raises on imperfect markup.
    @$dom->loadHTML($html);

    // grab all the links on the page
    $xpath = new DOMXPath($dom);
    $hrefs = $xpath->evaluate("/html/body//a");

    for ($i = 0; $i < $hrefs->length; $i++) {
        $href = $hrefs->item($i);
        $url = $href->getAttribute('href');

        // Keep site-relative links (leading "/") that are not images and have not been crawled yet.
        if(!@is_image('.'.$url) AND substr($url, 0, 1) === "/" AND !in_array($url, $crawled_array)){
            echo $url.'<br />';
            $tocrawl_array[] = $url;
        }
    }

    You can see that I've created an array to represent crawled links and another to represent links to be crawled. I just can't figure out how to make a loop until those two arrays are the same count.

    Thanks for your time!
    Last edited by schwim2; 03-11-2017 at 10:45 AM. Reason: To show progress

  2. #2
    Member
    Join Date
    Oct 2016
    Posts
    41
    Sorry for the thread bump, but the edit window has expired. Here's a better representation of what I hoped to do. It seems not to work because the foreach of the $tocrawl_array is not able to handle any new additions to the array, so I need to find another way to do this:

    PHP Code:
    // Base URL of the site to crawl; site-relative paths are appended to it.
    $site_domain = 'https://wheeltastic.com';
    // Queue of paths still to crawl, seeded with the index page.
    $tocrawl_array[] = '/';
    // Paths already crawled. Start with an empty list; "[] = array()" would
    // have stored an empty array as the first element instead.
    $crawled_array = array();

    /**
     * Report whether the file at $path is a GIF, JPEG, PNG or BMP image.
     *
     * @param string $path Path handed to getimagesize().
     * @return bool True when getimagesize() identifies one of the four types, false otherwise.
     */
    function is_image($path){
        $a = getimagesize($path);

        // getimagesize() returns false when it cannot read or identify the file;
        // stop here instead of indexing into a non-array.
        if($a === false){
            return false;
        }

        // Index 2 of the result holds the IMAGETYPE_* constant.
        $image_type = $a[2];

        if(in_array($image_type , array(IMAGETYPE_GIF , IMAGETYPE_JPEG ,IMAGETYPE_PNG , IMAGETYPE_BMP), true)){
            return true;
        }
        return false;
    }

    // Identify the crawler to the server through a custom User-Agent header.
    $options  = array('http' => array('user_agent' => 'Wheelie-Bot / Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'));
    $context  = stream_context_create($options);

    // NOTE(review): a by-value foreach walks a snapshot of $tocrawl_array, so
    // the links appended inside the loop are not visited by this loop.
    foreach($tocrawl_array AS $crawl){

        if(!in_array($crawl, $crawled_array)){

            $html = file_get_contents($site_domain.$crawl, false, $context);

            $dom = new DOMDocument();
            // @ silences the parser warnings loadHTML() raises on imperfect markup.
            @$dom->loadHTML($html);

            // grab all the links on the page
            $xpath = new DOMXPath($dom);
            $hrefs = $xpath->evaluate("/html/body//a");

            for ($i = 0; $i < $hrefs->length; $i++) {
                $href = $hrefs->item($i);
                $url = $href->getAttribute('href');

                // Queue site-relative links that are not images and are neither crawled nor queued.
                if(!@is_image('.'.$url) AND substr($url, 0, 1) === "/" AND !in_array($url, $crawled_array) AND !in_array($url, $tocrawl_array)){
                    echo $url.'<br />';
                    $tocrawl_array[] = $url;
                }
            }
            // Record the page that was just fetched (not the last link found on it).
            $crawled_array[] = $crawl;
        }
    }


  3. #3
    Member
    Join Date
    Oct 2016
    Posts
    41
    Alright, this is my latest version, also a failure. I tried a while loop:

    PHP Code:
    // Base URL of the site to crawl; site-relative paths are appended to it.
    $site_domain = 'https://wheeltastic.com';
    // Queue of paths still to crawl, seeded with the index page at key 0.
    $tocrawl_array[0] = '/';
    // Paths already crawled.
    $crawled_array = array();

    /**
     * Report whether the file at $path is a GIF, JPEG, PNG or BMP image.
     *
     * @param string $path Path handed to getimagesize().
     * @return bool True when getimagesize() identifies one of the four types, false otherwise.
     */
    function is_image($path){
        $a = getimagesize($path);

        // getimagesize() returns false when it cannot read or identify the file;
        // stop here instead of indexing into a non-array.
        if($a === false){
            return false;
        }

        // Index 2 of the result holds the IMAGETYPE_* constant.
        $image_type = $a[2];

        if(in_array($image_type , array(IMAGETYPE_GIF , IMAGETYPE_JPEG ,IMAGETYPE_PNG , IMAGETYPE_BMP), true)){
            return true;
        }
        return false;
    }

    // Identify the crawler to the server through a custom User-Agent header.
    $options  = array('http' => array('user_agent' => 'Wheelie-Bot / Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'));
    $context  = stream_context_create($options);

    $cd = 0; // next free index in $crawled_array
    $tc = 0; // index into $tocrawl_array that the while loop tests and reads

    while(array_key_exists($tc, $tocrawl_array)){

        echo $tc;

        $crawl = $tocrawl_array[$tc];
        if(!in_array($crawl, $crawled_array)){

            $html = file_get_contents($site_domain.$crawl, false, $context);
            $crawled_array[$cd] = $crawl;
            $cd = ++$cd;

            $dom = new DOMDocument();
            // @ silences the parser warnings loadHTML() raises on imperfect markup.
            @$dom->loadHTML($html);

            // grab all the links on the page
            $xpath = new DOMXPath($dom);
            $hrefs = $xpath->evaluate("/html/body//a");

            for ($i = 0; $i < $hrefs->length; $i++) {
                $href = $hrefs->item($i);
                $url = $href->getAttribute('href');

                if(!@is_image('.'.$url) AND substr($url, 0, 1) === "/" AND !in_array($url, $crawled_array) AND !in_array($url, $tocrawl_array)){
                    echo $url.'<br />';
                    // NOTE(review): ++$tc changes the same $tc the while loop
                    // uses as its cursor, so after this page $tc already points
                    // at the last key written and the increment below moves it
                    // past the end of the queue.
                    $tn = ++$tc;
                    $tocrawl_array[$tn] = $url;
                }
            }
        }
        $tc = ++$tc;

        // Stop it early for testing
        if($tc > 100){
            exit();
        }
    }

    print_r($tocrawl_array);
    At the bottom, I printed the array to make sure that the next array key existed, and it does.

    Code:
    />Array
    (
        [0] => /
        [1] => /?action=cart
        [2] => /?action=about
        [3] => /?action=contact
        [4] => /category/accessories
        [5] => /category/lights
        [6] => /category/mounts
        [7] => /category/wheel-accessories
        [8] => /category/wheels
        [9] => /category/winches
        [10] => /brand/method-race-wheels
        [11] => /brand/visionx
        [12] => /brand/baja-designs
        [13] => /brand/superwinch
        [14] => /brand/mile-marker
        [15] => /?action=my_stuff
        [16] => /?action=my_orders
        [17] => /?action=privacy
        [18] => /?action=tos
    )
    So although my intention was to have it go to $tocrawl_array[1], it seems to refuse to do so and stopped after the first iteration of the loop.

    I just don't know what else to try. I've exhausted my limited knowledge in matters such as these and will patiently wait for someone to point me in the correct direction.

  4. #4
    Member
    Join Date
    Oct 2016
    Posts
    41
    I think I may have gotten it:

    PHP Code:
    // Base URL of the site to crawl; site-relative paths are appended to it.
    $site_domain = 'https://wheeltastic.com';
    // Queue of paths still to crawl, seeded with the index page at key 0.
    $tocrawl_array[0] = '/';
    // Paths already crawled.
    $crawled_array = array();

    /**
     * Report whether the file at $path is a GIF, JPEG, PNG or BMP image.
     *
     * @param string $path Path handed to getimagesize().
     * @return bool True when getimagesize() identifies one of the four types, false otherwise.
     */
    function is_image($path){
        $a = getimagesize($path);

        // getimagesize() returns false when it cannot read or identify the file;
        // stop here instead of indexing into a non-array.
        if($a === false){
            return false;
        }

        // Index 2 of the result holds the IMAGETYPE_* constant.
        $image_type = $a[2];

        if(in_array($image_type , array(IMAGETYPE_GIF , IMAGETYPE_JPEG ,IMAGETYPE_PNG , IMAGETYPE_BMP), true)){
            return true;
        }
        return false;
    }

    $cd = 0; // next free index in $crawled_array
    $tc = 0; // highest key written to $tocrawl_array so far
    $ii = 0; // key of the queue entry currently being processed

    // The stream context is the same for every request, so build it once
    // rather than on every page.
    $options  = array('http' => array('user_agent' => 'Wheelie-Bot / Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'));
    $context  = stream_context_create($options);

    // Walk the queue by key; links appended during the loop are picked up
    // because the condition re-checks the array on every pass.
    while(array_key_exists($ii, $tocrawl_array)){

        $crawl = $tocrawl_array[$ii];
        if(!in_array($crawl, $crawled_array)){

            $html = file_get_contents($site_domain.$crawl, false, $context);
            $crawled_array[$cd] = $crawl;
            $cd = ++$cd;

            $dom = new DOMDocument();
            // @ silences the parser warnings loadHTML() raises on imperfect markup.
            @$dom->loadHTML($html);

            // grab all the links on the page
            $xpath = new DOMXPath($dom);
            $hrefs = $xpath->evaluate("/html/body//a");

            for ($i = 0; $i < $hrefs->length; $i++) {
                $href = $hrefs->item($i);
                $url = $href->getAttribute('href');

                // Queue site-relative links that are not images and are neither crawled nor queued.
                if(!@is_image('.'.$url) AND substr($url, 0, 1) === "/" AND !in_array($url, $crawled_array) AND !in_array($url, $tocrawl_array)){

                    $tc = ++$tc;
                    $tocrawl_array[$tc] = $url;

                    // Progress report every 25 queued links.
                    if($tc %25 == 0) {
                        echo $tc.' records processed<br>';
                    }

                    // Hard stop once 1000 links have been queued.
                    if($tc == 1000){
                        print_r($crawled_array);
                        exit();
                    }

                }
            }
        }
        $ii = ++$ii;
    }

    It's taking a long time to process but I've got another thread related to performance issues so I'll ask about it over there.

Thread Information

Users Browsing this Thread

There is currently 1 user browsing this thread. (0 members and 1 guests)

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •