maikelvanmaurik/schrapert

v0.0.3 2020-04-01 11:41 UTC

This package is auto-updated.

Last update: 2024-09-16 07:00:36 UTC


README

Schrapert 是一个受 Scrapy 启发的爬虫库。它使用 ReactPHP 来执行各种操作，例如下载请求和写入文件。

简单爬虫的示例

namespace Crawl;
use Schrapert\Spider;
use Schrapert\Crawl\ResponseInterface;
use Schrapert\Http\ResponseInterface as HttpResponse;
use Schrapert\Http\Request as HttpRequest;
use DOMDocument;
use DOMXPath;
use DOMElement;
/**
 * Example spider that follows every hyperlink found on a downloaded page.
 */
class BlogSpider extends Spider
{
    /**
     * Extracts the links of an HTML response and yields a request for each one.
     *
     * Non-HTTP responses, empty bodies and unparsable documents yield nothing.
     *
     * @param ResponseInterface $response Response handed to the spider.
     *
     * @return \Generator Yields one HttpRequest per anchor that carries an href attribute.
     */
    public function parse(ResponseInterface $response)
    {
        if (!$response instanceof HttpResponse) {
            return;
        }

        $html = (string)$response->getBody();
        if ($html === '') {
            // DOMDocument::loadHTML() rejects an empty string
            // (ValueError on PHP 8, warning on earlier versions).
            return;
        }

        $doc = new DOMDocument('1.0');

        // Real-world markup is rarely valid for libxml; collect parse errors
        // internally instead of emitting a PHP warning for each of them, then
        // restore the caller's setting before the first yield.
        $previous = libxml_use_internal_errors(true);
        $loaded = $doc->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        if ($loaded === false) {
            return;
        }

        $xpath = new DOMXPath($doc);

        // Only anchors that actually have an href can be followed.
        foreach ($xpath->query('//a[@href]') as $node) {
            /** @var DOMElement $node */
            // NOTE(review): $this->uri is presumably provided by the Spider base
            // class and resolves the href against the page URI - confirm.
            $uri = $this->uri->join($node->getAttribute('href'), $response->getUri());
            yield new HttpRequest($uri);
        }
    }
}