phonyland/ngram

🧪 用于 🙃 Phony 语言模型的 N-Gram 工具,包括清洗、分词、n-gram 提取、频率映射等功能

dev-master 2024-05-22 20:14 UTC

This package is auto-updated.

Last update: 2024-09-22 20:54:01 UTC


README

Phony Logo - Light Phony Logo - Dark

🧪
N-Gram工具

此仓库包含用于 🙃 Phony 语言模型的 N-Gram 工具,包括清洗、分词、n-gram 提取、频率映射等功能。

🚀 安装

需要PHP >= 8.0

您可以通过 Composer 安装此包:

composer require phonyland/ngram

⌨️ 使用方法

分词器

单词分词

$tokenizer->tokenize($text);
⌨️ 使用方法
use Phonyland\NGram\Tokenizer;
use Phonyland\NGram\TokenizerFilter;

$tokenizer = new Tokenizer();
$tokenizer
  ->addWordSeparatorPattern(';')
  ->addWordSeparatorPattern('\s')
  ->addWordFilterRule(TokenizerFilter::NO_SYMBOLS);

$text = 'sample   text;sample;text';

$tokenizer->tokenize($text);
🖥 输出
[
    "sample",
    "text",
    "sample",
    "text",
];

句子分词

$tokenizer->sentences($text);
⌨️ 使用方法
use Phonyland\NGram\Tokenizer;

$tokenizer = new Tokenizer();
$tokenizer
  ->addSentenceSeparatorPattern('.')
  ->addSentenceSeparatorPattern('!')
  ->addSentenceSeparatorPattern('?');

$text = 'Sample Sentence. Sample Sentence! Sample Sentence? Sample Sentence no. 4?! Sample sample sentence... End';

$tokenizer->sentences($text);
🖥 输出
[
    "Sample Sentence.",
    "Sample Sentence!",
    "Sample Sentence?",
    "Sample Sentence no.",
    "4?!",
    "Sample sample sentence...",
    "End",
];

按句子进行单词分词

$tokenizer->tokenizeBySentences($text);
⌨️ 使用方法
use Phonyland\NGram\Tokenizer;
use Phonyland\NGram\TokenizerFilter;

$tokenizer = new Tokenizer();
$tokenizer
  ->addSentenceSeparatorPattern('.')
  ->addSentenceSeparatorPattern('!')
  ->addSentenceSeparatorPattern('?')
  ->addWordFilterRule(TokenizerFilter::NO_SYMBOLS)
  ->addWordSeparatorPattern(TokenizerFilter::WHITESPACE_SEPARATOR);

$text = 'Sample Sentence. Sample Sentence! Sample Sentence? Sample Sentence no. 4?! Sample sample sentence... End';

$tokenizer->tokenizeBySentences($text);
🖥 输出
[
    ["Sample", "Sentence"],
    ["Sample", "Sentence"],
    ["Sample", "Sentence"],
    ["Sample", "Sentence", "no"],
    ["Sample", "sample", "sentence"],
    ["End"],
];

N-Gram

N-Gram序列

NGramSequence::multigram($n, $tokens, $isUnique);
NGramSequence::trigram($tokens, $isUnique);
NGramSequence::bigram($tokens, $isUnique);
NGramSequence::unigram($tokens, $isUnique);
⌨️ 使用方法
use Phonyland\NGram\Tokenizer;
use Phonyland\NGram\NGramSequence;
use Phonyland\NGram\TokenizerFilter;

$tokenizer = new Tokenizer();
$tokenizer->addWordSeparatorPattern(TokenizerFilter::WHITESPACE_SEPARATOR);
$tokens = $tokenizer->tokenize('sample text');

NGramSequence::multigram(4, $tokens);
// ['samp', 'ampl', 'mple', 'text'];

// Generate Unique N-Grams 
NGramSequence::unigram($tokens, true);
// ['s', 'a', 'm', 'p', 'l', 'e', 't', 'x'];

带计数的N-Gram序列

NGramCount::multigram(4, $tokens);
NGramCount::trigram($tokens);
NGramCount::bigram($tokens);
NGramCount::unigram($tokens);

NGramCount::incrementElementCount($element, $elements);
⌨️ 使用方法
use Phonyland\NGram\Tokenizer;
use Phonyland\NGram\NGramCount;
use Phonyland\NGram\TokenizerFilter;

$tokenizer = new Tokenizer();
$tokenizer->addWordSeparatorPattern(TokenizerFilter::WHITESPACE_SEPARATOR);
$tokens = $tokenizer->tokenize('sample text');

NGramCount::multigram(4, $tokens);
// [
//     'samp' => 1,
//     'ampl' => 1,
//     'mple' => 1,
//     'text' => 1,
// ];

N-Gram频率

NGramFrequency::multigram(4, $tokens);
NGramFrequency::trigram($tokens);
NGramFrequency::bigram($tokens);
NGramFrequency::unigram($tokens);

NGramFrequency::frequencyFromCount($countArray);
⌨️ 使用方法
use Phonyland\NGram\Tokenizer;
use Phonyland\NGram\NGramFrequency;
use Phonyland\NGram\TokenizerFilter;

$tokenizer = new Tokenizer();
$tokenizer->addWordSeparatorPattern(TokenizerFilter::WHITESPACE_SEPARATOR);
$tokenizer->addWordFilterRule(TokenizerFilter::ALPHABETICAL);
$tokens = $tokenizer->tokenize('bombadil! bombadillo!');

NGramFrequency::multigram(4, $tokens);
//[
//    'bomb' => 0.16666666666666666,
//    'omba' => 0.16666666666666666,
//    'mbad' => 0.16666666666666666,
//    'badi' => 0.16666666666666666,
//    'adil' => 0.16666666666666666,
//    'dill' => 0.08333333333333333,
//    'illo' => 0.08333333333333333,
//]

🙃

想使用 🙃 Phony 框架开始生成假数据,
请访问 Phony 主仓库

探索文档 » https://phony.land
在Twitter上关注我们 » @phony_land

🙃 Phony
假数据生成框架


由 Yunus Emre Deligöz 创建,
基于 MIT 许可证发布。