bgruszka/phpantispam
贝叶斯垃圾邮件识别库
v0.2.1
2016-08-30 13:10 UTC
Requires (Dev)
- phpunit/phpunit: 4.6.*
This package is not auto-updated.
Last update: 2024-09-28 18:00:00 UTC
README
PHPAntiSpam 是一个用于判断文档、消息或文本是否为垃圾邮件的库,其判定基于统计分析。
4个步骤的解释
- 创建分词器
- 从历史消息中创建语料库(包含词素)
- 选择用于分类的方法
- 分类消息
实现的方法
- Paul Graham方法
- Brian Burton方法
- Robinson几何平均测试方法
- Fisher-Robinson逆卡方检验方法
安装
composer require bgruszka/phpantispam "^0.2"
示例
<?php

// Usage example: build a tiny training corpus, then classify the same message
// with each of the four available methods and print the results.

// Load Composer's autoloader, then import the classes this example uses.
require_once 'vendor/autoload.php';

use PHPAntiSpam\Classifier;
use PHPAntiSpam\Corpus\ArrayCorpus;
use PHPAntiSpam\Method\BurtonMethod;
use PHPAntiSpam\Method\FisherRobinsonInverseChiSquareMethod;
use PHPAntiSpam\Method\GrahamMethod;
use PHPAntiSpam\Method\RobinsonGeometricMeanTestMethod;
use PHPAntiSpam\Tokenizer\WhitespaceTokenizer;

// Let's declare our example training set: each entry pairs a category label
// with the message content.
$messages = [
    ['category' => 'spam', 'content' => 'this is spam'],
    ['category' => 'nospam', 'content' => 'this is'],
];

// As tokenizer we can use the simplest one - WhitespaceTokenizer
// (but of course you can also use RegexpTokenizer or create a new one).
$tokenizer = new WhitespaceTokenizer();

// Let's define our corpus - the collection of text documents.
$corpus = new ArrayCorpus($messages, $tokenizer);

// For classifying text we can use different methods.

// ------------------------------------------------------------------------------------
// Graham method
// The result is printed as a single value and treated as spam unless it is below 0.9.
$classifier = new Classifier($corpus);
$classifier->setMethod(new GrahamMethod($corpus));

$spamProbability = $classifier->isSpam('This is spam');

echo 'With Graham method:' . PHP_EOL;
echo sprintf('Spam probability: %s', $spamProbability) . PHP_EOL;
echo sprintf('Is spam: %s', $spamProbability < 0.9 ? 'NO' : 'YES') . PHP_EOL . PHP_EOL;

// ------------------------------------------------------------------------------------
// Burton method
// Same result handling and 0.9 threshold as the Graham example above.
$classifier = new Classifier($corpus);
$classifier->setMethod(new BurtonMethod($corpus));

$spamProbability = $classifier->isSpam('This is spam');

echo 'With Burton method:' . PHP_EOL;
echo sprintf('Spam probability: %s', $spamProbability) . PHP_EOL;
echo sprintf('Is spam: %s', $spamProbability < 0.9 ? 'NO' : 'YES') . PHP_EOL . PHP_EOL;

// ------------------------------------------------------------------------------------
// Robinson Geometric Mean Test Method
// Here the result is read as an array with 'spamminess', 'hamminess' and
// 'combined' keys; the 'combined' value is compared against 0.55.
$classifier = new Classifier($corpus);
$classifier->setMethod(new RobinsonGeometricMeanTestMethod($corpus));

$spamProbability = $classifier->isSpam('This is spam');

echo 'With Robinson Geometric Mean Test method:' . PHP_EOL;
echo sprintf(
    'Spam probability: [spamminess: %s; hamminess: %s; combined: %s]',
    $spamProbability['spamminess'],
    $spamProbability['hamminess'],
    $spamProbability['combined']
) . PHP_EOL;
echo sprintf('Is spam: %s', $spamProbability['combined'] <= 0.55 ? 'NO' : 'YES') . PHP_EOL . PHP_EOL;

// ------------------------------------------------------------------------------------
// Fisher-Robinson Inverse Chi Square Method
// Same array-shaped result and 0.55 threshold as the Robinson example above.
$classifier = new Classifier($corpus);
$classifier->setMethod(new FisherRobinsonInverseChiSquareMethod($corpus));

$spamProbability = $classifier->isSpam('This is spam');

echo 'With Fisher-Robinson Inverse Chi Square method:' . PHP_EOL;
echo sprintf(
    'Spam probability: [spamminess: %s; hamminess: %s; combined: %s]',
    $spamProbability['spamminess'],
    $spamProbability['hamminess'],
    $spamProbability['combined']
) . PHP_EOL;
echo sprintf('Is spam: %s', $spamProbability['combined'] <= 0.55 ? 'NO' : 'YES') . PHP_EOL;