grab / 爬虫
此包已被弃用,不再维护。作者未推荐替代包。
基于 multi-curl 和 ReactPHP 的 PHP 异步爬虫,灵感来自 Python 的 Grab 框架
0.2
2017-05-10 14:43 UTC
Requires
- php: >=5.4.0
- imangazaliev/didom: *
- khr/react-curl: *
- react/event-loop: *
- react/promise: ^2.4
Requires (Dev)
- codeclimate/php-test-reporter: dev-master
- squizlabs/php_codesniffer: *
This package is not auto-updated.
Last update: 2020-08-26 22:38:54 UTC
README
基于 multi-curl 和 ReactPHP 的 PHP 异步爬虫,灵感来自 Python 的 Grab 框架
安装
要安装 grab-spider,请运行以下命令
composer require grab/spider "dev-master"
快速开始
<?php

require __DIR__ . '/../vendor/autoload.php';

/**
 * Example spider: walks the first four Hacker News listing pages and prints
 * the <title> text of every story linked from them.
 */
class HackerNewCrawler extends \Grab\Spider
{
    /**
     * Queues one 'page' task for each listing page (p=1 .. p=4).
     *
     * NOTE(review): the task name 'page' presumably routes the response to
     * taskPage() — confirm against \Grab\Spider.
     */
    public function taskGenerator()
    {
        foreach (range(1, 4) as $page) {
            $this->task('page', [
                'url' => sprintf('https://news.ycombinator.com/news?p=%d', $page),
                'max_request' => 10,
            ]);
        }
    }

    /**
     * Queues a 'topic' task for every `.storylink` element on a listing page.
     *
     * @param mixed $parser Parsed response; must expose find(). Presumably a
     *                      \DiDom\Document unless another parser is injected.
     * @param mixed $task   Task that produced this response (unused here).
     */
    public function taskPage($parser, $task)
    {
        foreach ($parser->find('.storylink') as $link) {
            $this->task('topic', [
                'url' => $link->getAttribute('href'),
                // Story targets are arbitrary external sites, so a per-task
                // cURL timeout is supplied for them.
                'curl_config' => [
                    CURLOPT_TIMEOUT => 60,
                ],
                'max_request' => 10,
            ]);
        }
    }

    /**
     * Prints the trimmed text of the first <title> element of a story page.
     *
     * @param mixed $parser Parsed response; must expose find().
     * @param mixed $task   Task that produced this response (unused here).
     */
    public function taskTopic($parser, $task)
    {
        $titles = $parser->find('title');

        // Some responses have no <title>; skip them instead of failing on
        // an undefined index / method call on null.
        if (empty($titles)) {
            return;
        }

        echo trim($titles[0]->text()) . PHP_EOL;
    }
}

$bot = new HackerNewCrawler();
$bot->debug = true;
$bot->setCurlSetting([
    CURLOPT_USERAGENT => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
]);

// Optional: uncomment to load a proxy list. Kept on its own line so the
// comment does not swallow the run() call below.
// $bot->loadProxy(__DIR__ . '/proxy_list.txt');

$bot->run();
通过简单的依赖注入(DI)更换解析器
// Each statement below is an alternative way to construct the spider with a
// custom parser callable; the callable receives the raw response content.
// NOTE(review): its return value is presumably what task handlers receive as
// $parser — confirm against \Grab\Spider.

// 1. Array callable: reuse a single DiDom document and call its load() method.
$parser = new \DiDom\Document();
$bot = new HackerNewCrawler([$parser, 'load']);

// 2. Closure: build a fresh DiDom document for every response.
$bot = new HackerNewCrawler(function ($content) {
    $document = new \DiDom\Document();

    return $document->load($content);
});

// 3. Closure: parse the response with SimpleXML.
$bot = new HackerNewCrawler(function ($content) {
    return simplexml_load_string($content);
});

// 4. Closure: hand the response content to a SoapClient.
$bot = new HackerNewCrawler(function ($content) {
    return new \SoapClient($content);
});