Resume aborted crawling-processes
Sometimes it may happen that a crawling-process (or a script that's using phpcrawl) gets aborted or terminated while spidering a website, before it has finished completely (for whatever reason).
Since PHPCrawl 0.81 it is possible to resume such an aborted script/process from the point where it was halted, so it's not necessary to restart the crawl all over again.
In order to be able to resume a crawling-process, you'll have to:
- Call the enableResumption() method in your script right from the first start (this prepares the crawler for a possible resumption and is necessary to be able to resume the script later on).
- Determine the unique crawler-ID by calling getCrawlerId() and store it somewhere (this ID is needed to identify the process that should be resumed later on):
// ...
$crawler = new MyCrawler();
$crawler->setURL("www.anyurl.com");
$crawler->enableResumption();
$ID = $crawler->getCrawlerId();
// ...
- Call the resume()-method before calling the go() or goMultiProcessed() method, and pass it the crawler-ID (as returned by getCrawlerId()) of the aborted crawling-process you want to resume:
// ...
$crawler->resume(120912912109); //ID of the aborted process
$crawler->goMultiProcessed(5);
// ...
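The following complete example puts these steps together into a resumable crawling-script: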
<?php
// Include the phpcrawl-mainclass
include("libs/PHPCrawler.class.php");
// Extend the class and override the handleDocumentInfo()-method
class MyCrawler extends PHPCrawler
{
  function handleDocumentInfo($DocInfo)
  {
    // Just detect linebreak for output
    if (PHP_SAPI == "cli") $lb = "\n";
    else $lb = "<br />";
    // Print the URL
    echo "Page requested: ".$DocInfo->url.$lb;
    flush();
  }
}
$crawler = new MyCrawler();
$crawler->setURL("www.php.net");
$crawler->addContentTypeReceiveRule("#text/html#");
$crawler->addURLFilterRule("#\.(jpg|jpeg|gif|png)$# i");
$crawler->setPageLimit(50); // Set page-limit to 50 for testing
// Important for resumable scripts/processes!
$crawler->enableResumption();
// At the first start of the script, retrieve the crawler-ID
// and store it (in a temporary file in this example)
if (!file_exists("/tmp/mycrawlerid_for_php.net.tmp"))
{
  $crawler_ID = $crawler->getCrawlerId();
  file_put_contents("/tmp/mycrawlerid_for_php.net.tmp", $crawler_ID);
}
// If the script was restarted again (after it was aborted),
// read the crawler-ID and pass it to the resume() method.
else
{
  $crawler_ID = file_get_contents("/tmp/mycrawlerid_for_php.net.tmp");
  $crawler->resume($crawler_ID);
}
// Start crawling
$crawler->goMultiProcessed(5);
// Delete the stored crawler-ID after the process is finished
// completely and successfully.
unlink("/tmp/mycrawlerid_for_php.net.tmp");
$report = $crawler->getProcessReport();
if (PHP_SAPI == "cli") $lb = "\n";
else $lb = "<br />";
echo "Summary:".$lb;
echo "Links followed: ".$report->links_followed.$lb;
echo "Documents received: ".$report->files_received.$lb;
echo "Bytes received: ".$report->bytes_received." bytes".$lb;
echo "Process runtime: ".$report->process_runtime." sec".$lb;
?>
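The same pattern works for a single-process crawl as well: resume() just has to be called before go(). The following is a minimal sketch of such a variant (the ID-file path and the URL are placeholders, and MyCrawler is assumed to be the extended class from the example above):
<?php
include("libs/PHPCrawler.class.php");
// MyCrawler is the same extended class as shown in the example above
$crawler = new MyCrawler();
$crawler->setURL("www.anyurl.com"); // placeholder URL
$crawler->enableResumption();
// Placeholder path for storing the crawler-ID
$id_file = "/tmp/mycrawlerid.tmp";
if (!file_exists($id_file))
{
  // First start of the script: remember the crawler-ID
  file_put_contents($id_file, $crawler->getCrawlerId());
}
else
{
  // Restarted after an abort: resume the old process
  $crawler->resume(file_get_contents($id_file));
}
// Single-process crawl
$crawler->go();
// Finished completely, so the stored ID isn't needed anymore
unlink($id_file);
?>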