mardi 4 août 2015

parsing website with domdocument

I wrote this script in order to extract external links (href attributes pointing outside the site) from a website. For the parsing I use DOMDocument, because it is advisable compared to regex, but I don't know if it is well written. Here is the script:

<?php 

  // Spidering a whole website may take a long time, so raise the
  // execution time limit well above the default 30 seconds.
     set_time_limit(10000); 

  // Include the PHPCrawl main class (local library path, not autoloaded)
 include_once('../PHPCrawl_083/PHPCrawl_083/libs/PHPCrawler.class.php'); 
  //include ('2.php');  
  // Extend the class and override the handleDocumentInfo()-method 

  class MyCrawler extends PHPCrawler
  {
    /**
     * Called by PHPCrawler once for every document it has received.
     *
     * Parses the page's HTML and prints the href of every link whose
     * host differs from the host of the current page (external links).
     *
     * @param PHPCrawlerDocumentInfo $DocInfo crawl result; ->url is the
     *        page URL, ->source its raw HTML content.
     */
    function handleDocumentInfo(PHPCrawlerDocumentInfo $DocInfo) {

        // Line terminator depends on where the script runs.
        // BUG FIX: in the original, everything below lived inside the
        // "else" branch, so CLI runs produced no output at all.
        if (PHP_SAPI == "cli") $lb = "\n";
        else $lb = "<br />";

        // Host of the page being processed, e.g. "tunisie-web.org".
        // BUG FIX: parse_url() with PHP_URL_HOST returns a string (or
        // null/false), not an array, so $home_url['host'] was wrong.
        $home_host = parse_url($DocInfo->url, PHP_URL_HOST);

        // BUG FIX: loadHTML() was given the URL *string*; it must be
        // given the downloaded HTML source. Skip empty documents.
        if ($DocInfo->source == "") return;

        $dom = new DOMDocument();
        // Tolerate the malformed HTML found on real-world pages.
        $dom->strictErrorChecking = FALSE;
        if (@$dom->loadHTML($DocInfo->source) === FALSE) return;

        // Walk every <a> element and print the external ones.
        foreach ($dom->getElementsByTagName("a") as $link) {
            $href = $link->getAttribute("href");
            if ($href == "") continue;

            // A link is external when it carries a host of its own that
            // differs from the current page's host. (Relative links have
            // no host and are therefore internal.)
            // BUG FIX: the original tested strpos(...) == -1 — PHP's
            // strpos() returns false, never -1, and the haystack/needle
            // arguments were swapped.
            $link_host = parse_url($href, PHP_URL_HOST);
            if ($link_host !== NULL && $link_host !== FALSE && $link_host !== $home_host) {
                // BUG FIX: echo the href string, not the DOMElement
                // object (which has no __toString and is a fatal error).
                echo $href . $lb;
            }
        }
    }
  }
    // Configure the spider and start the crawl of the target site.
    $spider = new MyCrawler();
    $spider->setURL("http://tunisie-web.org");

    // Ignore static assets — only HTML pages are worth parsing for links.
    $spider->addURLFilterRule("#\.(jpg|gif|png|pdf|jpeg|css|js)$#i");
    // Cache/working files go here instead of the system temp directory.
    $spider->setWorkingDirectory("C:/Users/mayss/Documents/travailcrawl/");
    $spider->go();

   ?>

Aucun commentaire:

Enregistrer un commentaire