andrew-svirin / resource-crawler-bundle
A crawler for any type of resource.
dev-main
2023-03-05 22:07 UTC
Requires
- php: >=8.1
- ext-dom: *
- ext-libxml: *
- symfony/framework-bundle: ^6.1
- symfony/http-client: ^6.1
- symfony/lock: ^6.1
- symfony/yaml: ^6.1
Requires (Dev)
- doctrine/doctrine-bundle: ^2.7
- doctrine/orm: ^2.13
- phpmd/phpmd: ^2.13
- phpstan/phpstan: ^1.8
- phpunit/phpunit: ^9.5
- sebastian/phpcpd: ^6.0
- squizlabs/php_codesniffer: ^3.7
This package is auto-updated.
Last update: 2024-09-06 01:08:35 UTC
README
Symfony bundle for crawling disk/web resources.
The spider bot can navigate across disk or web resources.
The web crawler can be customized with path masks and other options.
The crawler scans HTML documents, extracts hyperlinks, and pushes them into the index pool for the next iteration.
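As an illustration of how path masks might behave, here is a minimal, hypothetical sketch: masks prefixed with + must match a URI for it to be crawled, masks prefixed with - exclude it. This is a guess at the semantics based on the usage example further below, not the bundle's actual implementation.

<?php

// Hypothetical illustration of path-mask filtering (not the bundle's code):
// `+` masks are required to match, `-` masks reject a URI.
function uriPassesMasks(string $uri, array $pathMasks): bool
{
    foreach ($pathMasks as $mask) {
        $sign    = $mask[0];
        $pattern = substr($mask, 1);

        if ('-' === $sign && str_contains($uri, $pattern)) {
            return false; // excluded by a negative mask
        }

        if ('+' === $sign && !str_contains($uri, $pattern)) {
            return false; // does not match a required positive mask
        }
    }

    return true;
}

var_dump(uriPassesMasks('https://site.com/page.html', ['+site.com/', '-embed']));  // true
var_dump(uriPassesMasks('https://site.com/embed/video', ['+site.com/', '-embed'])); // false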
Installation
composer require andrew-svirin/resource-crawler-bundle:dev-main
Add the following to doctrine.yaml so that Doctrine does not try to associate the resource_crawler_* tables with entities:
doctrine:
dbal:
schema_filter: ~^(?!resource_crawler_)~
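The negative lookahead in the filter matches any table name that does not start with resource_crawler_, so only those tables are excluded from schema diffing. A quick stand-alone check (plain PHP, for illustration only):

<?php

// Illustration only: the schema_filter regex matches names that do NOT
// start with "resource_crawler_", so application tables stay under
// Doctrine's control while the crawler tables are ignored.
$filter = '~^(?!resource_crawler_)~';

var_dump((bool) preg_match($filter, 'app_users'));              // true: managed by Doctrine
var_dump((bool) preg_match($filter, 'resource_crawler_nodes')); // false: ignored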
Add the following to resource_crawler.yaml to configure the bundle's process store:
resource_crawler:
process:
is_lockable: true
store: 'resource_crawler.process_db_store'
# store: 'resource_crawler.process_file_store'
file_store:
dir: "%kernel.project_dir%/storage/saver"
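With resource_crawler.process_db_store the crawling state lives in the database tables created by the migration below; the commented-out resource_crawler.process_file_store alternative presumably writes it under file_store.dir instead, and is_lockable: true likely relies on symfony/lock (a required dependency) to guard against concurrent crawl runs. If you opt for the file store, the configured directory may need to exist and be writable; a minimal check (plain PHP, assuming the default dir above) could look like:

<?php

// Illustration only, relevant if you switch to the file store: ensure the
// directory configured as file_store.dir exists and is writable.
$dir = __DIR__ . '/storage/saver';

if (!is_dir($dir) && !mkdir($dir, 0775, true)) {
    throw new \RuntimeException(sprintf('Cannot create directory "%s".', $dir));
}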
Add the migration Version20230101010000.php:
<?php

declare(strict_types=1);

namespace DoctrineMigrations;

use Doctrine\DBAL\Schema\Schema;
use Doctrine\Migrations\AbstractMigration;

final class Version20230101010000 extends AbstractMigration
{
    public function getDescription(): string
    {
        return '';
    }

    public function up(Schema $schema): void
    {
        $this->addSql('CREATE TABLE resource_crawler_processes (
            id INT AUTO_INCREMENT NOT NULL,
            `name` VARCHAR(1024) NOT NULL,
            PRIMARY KEY(id)
        ) DEFAULT CHARACTER SET utf8mb4 COLLATE `utf8mb4_unicode_ci` ENGINE = InnoDB');

        $this->addSql('CREATE TABLE resource_crawler_nodes (
            id INT AUTO_INCREMENT NOT NULL,
            process_id INT NOT NULL,
            `status` ENUM("for_processing", "in_process", "processed", "ignored", "errored") NOT NULL,
            `type` ENUM("html", "img") NOT NULL,
            `uri_type` ENUM("http", "fs") NOT NULL,
            `uri_path` VARCHAR(4096) NOT NULL,
            `code` INT UNSIGNED,
            PRIMARY KEY(id),
            INDEX (`process_id`),
            INDEX (`status`)
        ) DEFAULT CHARACTER SET utf8mb4 COLLATE `utf8mb4_unicode_ci` ENGINE = InnoDB');

        $this->addSql('ALTER TABLE resource_crawler_nodes ADD CONSTRAINT FK_NODE_PROCESS FOREIGN KEY (process_id) REFERENCES resource_crawler_processes (id)');
    }

    public function down(Schema $schema): void
    {
        $this->addSql('ALTER TABLE resource_crawler_nodes DROP FOREIGN KEY FK_NODE_PROCESS');
        $this->addSql('DROP TABLE resource_crawler_nodes');
        $this->addSql('DROP TABLE resource_crawler_processes');
    }
}
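Then run the migration (assuming the Doctrine Migrations bundle, listed above as a dev dependency, is set up in your application):

php bin/console doctrine:migrations:migrate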
Usage
<?php

use AndrewSvirin\ResourceCrawlerBundle\Crawler\Ref\Ref;
use AndrewSvirin\ResourceCrawlerBundle\Crawler\RefHandlerClosureInterface;
use AndrewSvirin\ResourceCrawlerBundle\Process\Task\CrawlingTask;

/* @var $resourceCrawler \AndrewSvirin\ResourceCrawlerBundle\Crawler\ResourceCrawler */
// Resolve the service by alias or by class.
$resourceCrawler = $this->getContainer()->get('resource_crawler.crawler');

$url       = 'https://site.com/index.html';
$pathMasks = ['+site.com/', '-embed'];

$substitutionRules = [
    ['/(#other-anchor)/i', ''],              // remove the anchor `other-anchor`
    ['/(\?.*)([&*]h=[^&#]*)(.*)/i', '$1$3'], // remove the query param `h`
    ['/(\?.*)([&*]w=[^&#]*)(.*)/i', '$1$3'], // remove the query param `w`
];

$op = new class() implements RefHandlerClosureInterface {
    public function call(Ref $ref, CrawlingTask $task): void
    {
        // Here it is possible to handle the reference in the task node.
    }
};

// Perform one of multiple crawl iterations.
$task = $resourceCrawler->crawlWebResource($url, $pathMasks, $substitutionRules, $op);

// Get an analysis of the resource crawling.
$analyze = $resourceCrawler->analyzeCrawlingWebResource($url);

if ($someExceptionCondition) {
    // Move the task back to be crawled again.
    $resourceCrawler->rollbackTask($task);
}

// Reset all crawling-related data.
$resourceCrawler->resetWebResource($url);
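The substitution rules appear to be [pattern, replacement] pairs applied to discovered URIs. A stand-alone preg_replace call shows the intended effect of the h-parameter rule (illustration only, not the bundle's internals):

<?php

// Illustration only: applying one substitution rule with preg_replace to
// show what it strips from a URI before indexing.
$rule = ['/(\?.*)([&*]h=[^&#]*)(.*)/i', '$1$3'];

echo preg_replace($rule[0], $rule[1], 'https://site.com/img.jpg?w=100&h=200');
// => https://site.com/img.jpg?w=100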
Development

- Use make build to prepare the infrastructure
- Use make start to start the infrastructure
- Use make install to install dependencies
- Run tests with debugging: make xdebug filter=value