andrew-svirin / resource-crawler-bundle
A crawler for any type of resource.
dev-main
2023-03-05 22:07 UTC
Requires
- php: >=8.1
- ext-dom: *
- ext-libxml: *
- symfony/framework-bundle: ^6.1
- symfony/http-client: ^6.1
- symfony/lock: ^6.1
- symfony/yaml: ^6.1
Requires (Dev)
- doctrine/doctrine-bundle: ^2.7
- doctrine/orm: ^2.13
- phpmd/phpmd: ^2.13
- phpstan/phpstan: ^1.8
- phpunit/phpunit: ^9.5
- sebastian/phpcpd: ^6.0
- squizlabs/php_codesniffer: ^3.7
This package is auto-updated.
Last update: 2024-09-06 01:08:35 UTC
README
Symfony bundle for crawling disk/web resources.
The spider bot can navigate across disk or web resources.
The web crawler can be customized with path masks and other options.
The crawler scans HTML documents, extracts hyperlinks, and pushes them into the index pool for the next iteration.
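As an illustration of how path masks might behave, here is a minimal, hypothetical sketch: masks prefixed with + must match a URI for it to be crawled, masks prefixed with - exclude it. This is a guess at the semantics based on the usage example further below, not the bundle's actual implementation.

<?php

// Hypothetical illustration of path-mask filtering (not the bundle's code):
// `+` masks are required to match, `-` masks reject a URI.
function uriPassesMasks(string $uri, array $pathMasks): bool
{
    foreach ($pathMasks as $mask) {
        $sign    = $mask[0];
        $pattern = substr($mask, 1);

        if ('-' === $sign && str_contains($uri, $pattern)) {
            return false; // excluded by a negative mask
        }

        if ('+' === $sign && !str_contains($uri, $pattern)) {
            return false; // does not match a required positive mask
        }
    }

    return true;
}

var_dump(uriPassesMasks('https://site.com/page.html', ['+site.com/', '-embed']));  // true
var_dump(uriPassesMasks('https://site.com/embed/video', ['+site.com/', '-embed'])); // false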
Installation
composer require andrew-svirin/resource-crawler-bundle:dev-main
Add the following to doctrine.yaml so that Doctrine does not try to associate the resource_crawler_* tables with entities:
doctrine:
dbal:
schema_filter: ~^(?!resource_crawler_)~
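The negative lookahead in the filter matches any table name that does not start with resource_crawler_, so only those tables are excluded from schema diffing. A quick stand-alone check (plain PHP, for illustration only):

<?php

// Illustration only: the schema_filter regex matches names that do NOT
// start with "resource_crawler_", so application tables stay under
// Doctrine's control while the crawler tables are ignored.
$filter = '~^(?!resource_crawler_)~';

var_dump((bool) preg_match($filter, 'app_users'));              // true: managed by Doctrine
var_dump((bool) preg_match($filter, 'resource_crawler_nodes')); // false: ignored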
Add the following to resource_crawler.yaml to configure the bundle's process store:
resource_crawler:
process:
is_lockable: true
store: 'resource_crawler.process_db_store'
# store: 'resource_crawler.process_file_store'
file_store:
dir: "%kernel.project_dir%/storage/saver"
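With resource_crawler.process_db_store the crawling state lives in the database tables created by the migration below; the commented-out resource_crawler.process_file_store alternative presumably writes it under file_store.dir instead, and is_lockable: true likely relies on symfony/lock (a required dependency) to guard against concurrent crawl runs. If you opt for the file store, the configured directory may need to exist and be writable; a minimal check (plain PHP, assuming the default dir above) could look like:

<?php

// Illustration only, relevant if you switch to the file store: ensure the
// directory configured as file_store.dir exists and is writable.
$dir = __DIR__ . '/storage/saver';

if (!is_dir($dir) && !mkdir($dir, 0775, true)) {
    throw new \RuntimeException(sprintf('Cannot create directory "%s".', $dir));
}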
Add the migration Version20230101010000.php:
<?php

declare(strict_types=1);

namespace DoctrineMigrations;

use Doctrine\DBAL\Schema\Schema;
use Doctrine\Migrations\AbstractMigration;

final class Version20230101010000 extends AbstractMigration
{
    public function getDescription(): string
    {
        return '';
    }

    public function up(Schema $schema): void
    {
        $this->addSql('CREATE TABLE resource_crawler_processes (
            id INT AUTO_INCREMENT NOT NULL,
            `name` VARCHAR(1024) NOT NULL,
            PRIMARY KEY(id)
        ) DEFAULT CHARACTER SET utf8mb4 COLLATE `utf8mb4_unicode_ci` ENGINE = InnoDB');

        $this->addSql('CREATE TABLE resource_crawler_nodes (
            id INT AUTO_INCREMENT NOT NULL,
            process_id INT NOT NULL,
            `status` ENUM("for_processing", "in_process", "processed", "ignored", "errored") NOT NULL,
            `type` ENUM("html", "img") NOT NULL,
            `uri_type` ENUM("http", "fs") NOT NULL,
            `uri_path` VARCHAR(4096) NOT NULL,
            `code` INT UNSIGNED,
            PRIMARY KEY(id),
            INDEX (`process_id`),
            INDEX (`status`)
        ) DEFAULT CHARACTER SET utf8mb4 COLLATE `utf8mb4_unicode_ci` ENGINE = InnoDB');

        $this->addSql('ALTER TABLE resource_crawler_nodes ADD CONSTRAINT FK_NODE_PROCESS FOREIGN KEY (process_id) REFERENCES resource_crawler_processes (id)');
    }

    public function down(Schema $schema): void
    {
        $this->addSql('ALTER TABLE resource_crawler_nodes DROP FOREIGN KEY FK_NODE_PROCESS');
        $this->addSql('DROP TABLE resource_crawler_nodes');
        $this->addSql('DROP TABLE resource_crawler_processes');
    }
}
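Then run the migration (assuming the Doctrine Migrations bundle, listed above as a dev dependency, is set up in your application):

php bin/console doctrine:migrations:migrate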
Usage
<?php

use AndrewSvirin\ResourceCrawlerBundle\Crawler\Ref\Ref;
use AndrewSvirin\ResourceCrawlerBundle\Crawler\RefHandlerClosureInterface;
use AndrewSvirin\ResourceCrawlerBundle\Process\Task\CrawlingTask;

/* @var $resourceCrawler \AndrewSvirin\ResourceCrawlerBundle\Crawler\ResourceCrawler */
// Resolve the service by alias or by class.
$resourceCrawler = $this->getContainer()->get('resource_crawler.crawler');

$url       = 'https://site.com/index.html';
$pathMasks = ['+site.com/', '-embed'];

$substitutionRules = [
    ['/(#other-anchor)/i', ''],              // remove the anchor `other-anchor`
    ['/(\?.*)([&*]h=[^&#]*)(.*)/i', '$1$3'], // remove the query param `h`
    ['/(\?.*)([&*]w=[^&#]*)(.*)/i', '$1$3'], // remove the query param `w`
];

$op = new class() implements RefHandlerClosureInterface {
    public function call(Ref $ref, CrawlingTask $task): void
    {
        // Here it is possible to handle the reference in the task node.
    }
};

// Perform one of multiple crawl iterations.
$task = $resourceCrawler->crawlWebResource($url, $pathMasks, $substitutionRules, $op);

// Get an analysis of the resource crawling.
$analyze = $resourceCrawler->analyzeCrawlingWebResource($url);

if ($someExceptionCondition) {
    // Move the task back to be crawled again.
    $resourceCrawler->rollbackTask($task);
}

// Reset all crawling-related data.
$resourceCrawler->resetWebResource($url);
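The substitution rules appear to be [pattern, replacement] pairs applied to discovered URIs. A stand-alone preg_replace call shows the intended effect of the h-parameter rule (illustration only, not the bundle's internals):

<?php

// Illustration only: applying one substitution rule with preg_replace to
// show what it strips from a URI before indexing.
$rule = ['/(\?.*)([&*]h=[^&#]*)(.*)/i', '$1$3'];

echo preg_replace($rule[0], $rule[1], 'https://site.com/img.jpg?w=100&h=200');
// => https://site.com/img.jpg?w=100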
Development

- Use make build to prepare the infrastructure
- Use make start to start the infrastructure
- Use make install to install dependencies
- Run tests with debugging: make xdebug filter=value