brittainmedia / phpcrawl
此包已被放弃,并不再维护。没有建议的替代包。
PHPCrawl 是一个用 PHP 编写的网络爬虫/网络蜘蛛库。它支持过滤器、限制器、Cookie 处理、robots.txt 处理、多进程处理等。
0.10.1
2023-04-19 21:40 UTC
README
phpcrawl
composer require brittainmedia/phpcrawl
use PHPCrawl\Enums\PHPCrawlerAbortReasons;
use PHPCrawl\Enums\PHPCrawlerMultiProcessModes;
use PHPCrawl\Enums\PHPCrawlerUrlCacheTypes;
use PHPCrawl\PHPCrawler;
use PHPCrawl\PHPCrawlerDocumentInfo;

// New custom crawler: override handleDocumentInfo() to process every document
// the crawler receives. Returning a value >= 0 continues the crawl; a negative
// return aborts it (reported as ABORTREASON_USERABORT).
$crawler = new class() extends PHPCrawler {
    /**
     * Called once for every received document.
     *
     * @param PHPCrawlerDocumentInfo $PageInfo info about the received page/file
     * @return int 1 to continue crawling, negative to abort
     */
    function handleDocumentInfo($PageInfo): int
    {
        // Print the URL of the document
        echo "URL: " . $PageInfo->url . PHP_EOL;

        // Print the http-status-code
        echo "HTTP-statuscode: " . $PageInfo->http_status_code . PHP_EOL;

        // Print the number of found links in this document
        echo "Links found: " . count($PageInfo->links_found_url_descriptors) . PHP_EOL;

        // ..

        // continue crawling
        return 1;
    }
};

$crawler->setURL($url = 'https://bbc.co.uk/news');

// Optional
//$crawler->setProxy($proxy_host, $proxy_port, $proxy_username, $proxy_password);

// Only receive content of files with content-type "text/html"
$crawler->addContentTypeReceiveRule('#text/html#');

// Ignore links to ads...
$advertFilterRule = "/\bads\b|2o7|a1\.yimg|ad(brite|click|farm|revolver|server|tech|vert)|at(dmt|wola)|banner|bizrate|blogads|bluestreak|burstnet|casalemedia|coremetrics|(double|fast)click|falkag|(feedster|right)media|googlesyndication|hitbox|httpads|imiclk|intellitxt|js\.overture|kanoodle|kontera|mediaplex|nextag|pointroll|qksrv|speedera|statcounter|tribalfusion|webtrends/";
$crawler->addURLFilterRule($advertFilterRule);

// Store and send cookie-data like a browser does
$crawler->enableCookieHandling(true);

// Limits set, successfully retrieved only
$crawler->setRequestLimit(1);

/**
 * Follow-mode 3 - The crawler only follows links to pages or files located in
 * or under the same path as the root-url.
 * E.g. if the root-url is "http://www.foo.com/bar/index.html",
 * the crawler will follow links to "http://www.foo.com/bar/page.html" and
 * "http://www.foo.com/bar/path/index.html",
 * but not links to "http://www.foo.com/page.html".
 */
$crawler->setFollowMode(3);

// Keep going until resolved
$crawler->setFollowRedirectsTillContent(TRUE);

// tmp directory
$crawler->setWorkingDirectory(sys_get_temp_dir() . DIRECTORY_SEPARATOR . 'phpcrawl' . DIRECTORY_SEPARATOR);

// Cache
$crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_MEMORY);

// File crawling - Store to file or set limit for large files
#$crawler->addStreamToFileContentType('##');
#$crawler->setContentSizeLimit(500000); // Google only crawls pages 500kb and below?

//Decides whether the crawler should obey "nofollow"-tags, we will obey
$crawler->obeyNoFollowTags(true);

//Decides whether the crawler should obey robots.txt, we will not obey!
$crawler->obeyRobotsTxt(false);

// Delay to stop blocking
$crawler->setRequestDelay(0.5);

// fake browser or use fake robot one
$crawler->setUserAgentString('Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0');

// Multiprocess (optional) - Forces PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE use, must have link priorities!
// NOTE: use goMultiProcessed() INSTEAD of go() below — calling both would start
// the crawl twice.
//$crawler->addLinkPriority("/news/", 10);
//$crawler->addLinkPriority("/\.jpeg/", 5);
//$crawler->goMultiProcessed(PHPCrawlerMultiProcessModes::MPMODE_CHILDS_EXECUTES_USERCODE);

// Thats enough, now here we go
$crawler->go();

// At the end, after the process is finished, we print a short
// report (see method getProcessReport() for more information)
$report = $crawler->getProcessReport();

echo 'Finished crawling site: ' . $url . PHP_EOL;
echo 'Summary:' . PHP_EOL;
echo 'Links followed: ' . $report->links_followed . PHP_EOL;
echo 'Documents received: ' . $report->files_received . PHP_EOL;
echo 'Bytes received: ' . $report->bytes_received . ' bytes' . PHP_EOL;
echo 'Process runtime: ' . $report->process_runtime . ' sec' . PHP_EOL;
// memory_peak_usage is a byte count, not a duration
echo 'Process memory: ' . $report->memory_peak_usage . ' bytes' . PHP_EOL;
echo 'Server connect time: ' . $report->avg_server_connect_time . ' sec' . PHP_EOL;
echo 'Server response time: ' . $report->avg_server_response_time . ' sec' . PHP_EOL;
echo 'Server transfer rate: ' . $report->avg_proc_data_transfer_rate . ' bytes' . PHP_EOL;

// Explain why the crawling process stopped.
$abortReason = $report->abort_reason;
switch ($abortReason) {
    case PHPCrawlerAbortReasons::ABORTREASON_PASSEDTHROUGH:
        echo 'Crawling-process aborted because everything is done/passed through.' . PHP_EOL;
        break;
    case PHPCrawlerAbortReasons::ABORTREASON_TRAFFICLIMIT_REACHED:
        echo 'Crawling-process aborted because the traffic limit set by user was reached.' . PHP_EOL;
        break;
    case PHPCrawlerAbortReasons::ABORTREASON_FILELIMIT_REACHED:
        echo 'Crawling-process aborted because the file limit set by user was reached.' . PHP_EOL;
        break;
    case PHPCrawlerAbortReasons::ABORTREASON_USERABORT:
        echo 'Crawling-process aborted because the handleDocumentInfo-method returned a negative value.' . PHP_EOL;
        break;
    default:
        echo 'Unknown abort reason.' . PHP_EOL;
        break;
}
最初只是 http://phpcrawl.cuab.de/ 的一个副本,从 mmerian 分支出来,用于与 composer 一起使用。
由于主项目似乎现在已被放弃(已4年没有更新),我将继续在这个仓库中做出任何更改/修复。
最新更新
- 0.9 兼容 PHP 7。
- 0.10 兼容 PHP 8。(如有问题请提交 issue)
- 引入了命名空间
- 许多错误修复
- 重构了各种类部分
现在已存档...