I wrote this script to extract external links (hrefs pointing outside the site) from a website. For parsing I use DOMDocument, since it is recommended over regex, but I am not sure the script is well written. Here it is:
<?php
// Spidering a whole website can take a long time, so raise the execution limit.
set_time_limit(10000);
// Include the PHPCrawl main class (the crawler framework this script builds on).
include_once('../PHPCrawl_083/PHPCrawl_083/libs/PHPCrawler.class.php');
//include ('2.php');
// Extend the class and override the handleDocumentInfo()-method.
class MyCrawler extends PHPCrawler
{
    /**
     * Called by PHPCrawl once per downloaded document.
     *
     * Parses the page's HTML and prints every "external" link, i.e. an
     * absolute href whose host differs from the host of the crawled page.
     * Relative hrefs (no host component) always belong to the same site
     * and are skipped.
     *
     * @param PHPCrawlerDocumentInfo $DocInfo info about the received page
     * @return void
     */
    function handleDocumentInfo(PHPCrawlerDocumentInfo $DocInfo)
    {
        // Line break depends on how the script runs (terminal vs. browser).
        // BUG FIXED: originally all the work sat inside the non-CLI "else"
        // branch, so the crawler printed nothing when run from the CLI.
        $lb = (PHP_SAPI == "cli") ? "\n" : "<br />";

        // Host of the page we just crawled. NOTE: with PHP_URL_HOST,
        // parse_url() returns a string, not an array — the original code
        // indexed the result as $home_url['host'], which was always null.
        $page_host = parse_url($DocInfo->url, PHP_URL_HOST);

        // Nothing to parse (non-HTML or empty response).
        if ($DocInfo->source == "") {
            return;
        }

        // Parse the document SOURCE, not the URL string — the original
        // called loadHTML($DocInfo->url) by mistake, so the DOM only ever
        // contained the URL as text and never any <a> elements.
        $dom = new DOMDocument();
        $dom->strictErrorChecking = FALSE;
        libxml_use_internal_errors(true); // tolerate real-world broken HTML
        $dom->loadHTML($DocInfo->source);
        libxml_clear_errors();

        // Walk all anchors and report the external ones.
        foreach ($dom->getElementsByTagName("a") as $link) {
            $href = $link->getAttribute("href");
            if ($href == "") {
                continue;
            }
            // BUG FIXED: strpos() never returns -1 in PHP (it returns FALSE
            // on no match), and the original also swapped haystack/needle.
            // Instead, compare the link's host against the page's host.
            $link_host = parse_url($href, PHP_URL_HOST);
            if ($link_host !== null && $link_host !== false && $link_host != $page_host) {
                // BUG FIXED: echo the href string, not the DOMElement object.
                echo $href . $lb;
            }
        }
    }
}
// Configure the spider and start crawling.
$spider = new MyCrawler();
$spider->setURL("http://tunisie-web.org");
// Skip static assets so only HTML pages reach handleDocumentInfo().
$spider->addURLFilterRule("#\.(jpg|gif|png|pdf|jpeg|css|js)$#i");
// Directory PHPCrawl uses for its temporary/working files.
$spider->setWorkingDirectory("C:/Users/mayss/Documents/travailcrawl/");
$spider->go();
?>
Aucun commentaire:
Enregistrer un commentaire