diff options
Diffstat (limited to 'vendor/patrickschur/language-detection/src/LanguageDetection')
6 files changed, 501 insertions, 0 deletions
<?php

declare(strict_types = 1);

namespace LanguageDetection;

use LanguageDetection\Tokenizer\TokenizerInterface;
use LanguageDetection\Tokenizer\WhitespaceTokenizer;

// File: vendor/patrickschur/language-detection/src/LanguageDetection/NgramParser.php

/**
 * Class NgramParser
 *
 * Base class that turns a text into a ranked list of character n-grams.
 * It is declared before Language so that the parent class exists when the
 * child class declaration is reached.
 *
 * @copyright Patrick Schur
 * @license https://opensource.org/licenses/mit-license.html MIT
 * @author Patrick Schur <patrick_schur@outlook.de>
 * @package LanguageDetection
 */
abstract class NgramParser
{
    /**
     * @var int Shortest n-gram length that is extracted
     */
    protected $minLength = 1;

    /**
     * @var int Longest n-gram length that is extracted
     */
    protected $maxLength = 3;

    /**
     * @var int Maximum number of n-grams returned by getNgrams()
     */
    protected $maxNgrams = 310;

    /**
     * @var TokenizerInterface|null Created lazily on first use when not set explicitly
     */
    protected $tokenizer = null;

    /**
     * @param int $minLength
     * @throws \LengthException if $minLength is not in the range 1 .. maxLength - 1
     */
    public function setMinLength(int $minLength)
    {
        if ($minLength <= 0 || $minLength >= $this->maxLength)
        {
            throw new \LengthException('$minLength must be greater than zero and less than $this->maxLength.');
        }

        $this->minLength = $minLength;
    }

    /**
     * @param int $maxLength
     * @throws \LengthException if $maxLength is not greater than the current minLength
     */
    public function setMaxLength(int $maxLength)
    {
        if ($maxLength <= $this->minLength)
        {
            throw new \LengthException('$maxLength must be greater than $this->minLength.');
        }

        $this->maxLength = $maxLength;
    }

    /**
     * @param int $maxNgrams
     * @throws \LengthException if $maxNgrams is not positive
     */
    public function setMaxNgrams(int $maxNgrams)
    {
        if ($maxNgrams <= 0)
        {
            throw new \LengthException('$maxNgrams must be greater than zero.');
        }

        $this->maxNgrams = $maxNgrams;
    }

    /**
     * Sets the tokenizer
     *
     * @param TokenizerInterface $tokenizer
     */
    public function setTokenizer(TokenizerInterface $tokenizer)
    {
        $this->tokenizer = $tokenizer;
    }

    /**
     * Splits the text into words, falling back to a WhitespaceTokenizer
     * when no tokenizer has been set.
     *
     * @param string $str
     * @return array
     */
    private function tokenize(string $str)
    {
        if (null === $this->tokenizer)
        {
            $this->tokenizer = new WhitespaceTokenizer();
        }

        return $this->tokenizer->tokenize($str);
    }

    /**
     * Extracts the most frequent n-grams of the given text.
     *
     * Occurrences are counted per n-gram length, converted to relative
     * frequencies within each length group, merged, and sorted by descending
     * frequency. At most $this->maxNgrams entries are returned.
     *
     * @param string $str
     * @return string[]
     */
    protected function getNgrams(string $str): array
    {
        $tokens = [];

        foreach ($this->tokenize($str) as $word)
        {
            $l = \mb_strlen($word);

            for ($i = $this->minLength; $i <= $this->maxLength; ++$i)
            {
                for ($j = 0; ($i + $j - 1) < $l; ++$j)
                {
                    $ngram = \mb_substr($word, $j, $i);
                    $tokens[$i][$ngram] = ($tokens[$i][$ngram] ?? 0) + 1;
                }
            }
        }

        foreach ($tokens as $i => $token)
        {
            $sum = \array_sum($token);

            foreach ($token as $j => $value)
            {
                $tokens[$i][$j] = $value / $sum;
            }
        }

        if (!\count($tokens))
        {
            return [];
        }

        // array_replace() keeps keys as they are. array_merge() would renumber
        // integer-like n-grams such as "1" or "42" and thereby lose them.
        // Groups never share a key because their n-grams differ in length.
        $tokens = \array_replace(...$tokens);
        unset($tokens['_']);

        \arsort($tokens, SORT_NUMERIC);

        // PHP stores integer-like string keys as int; cast back to string.
        return \array_map(
            'strval',
            \array_slice(
                \array_keys($tokens),
                0,
                $this->maxNgrams
            )
        );
    }
}

// File: vendor/patrickschur/language-detection/src/LanguageDetection/Language.php

/**
 * Class Language
 *
 * @copyright Patrick Schur
 * @license https://opensource.org/licenses/mit-license.html MIT
 * @author Patrick Schur <patrick_schur@outlook.de>
 * @package LanguageDetection
 */
class Language extends NgramParser
{
    /**
     * Language code => (n-gram => rank in that language's profile)
     *
     * @var array<string, array<string, int>>
     */
    protected $tokens = [];

    /**
     * Loads all language files
     *
     * @param array $lang List of ISO 639-1 codes, that should be used in the detection phase
     * @param string $dirname Name of the directory where the translations files are located
     * @throws \InvalidArgumentException if $dirname is given but is not a readable directory
     */
    public function __construct(array $lang = [], string $dirname = '')
    {
        // Compare against '' explicitly: empty() would also treat "0" as "not given".
        if ('' === $dirname)
        {
            $dirname = __DIR__ . '/../../resources/*/*.php';
        }
        else if (!\is_dir($dirname) || !\is_readable($dirname))
        {
            throw new \InvalidArgumentException('Provided directory could not be found or is not readable');
        }
        else
        {
            $dirname = \rtrim($dirname, '/');
            $dirname .= '/*/*.php';
        }

        $isEmpty = empty($lang);
        $tokens = [];

        // glob() returns false on error; treat that as "no files".
        $files = \glob($dirname) ?: [];

        foreach ($files as $file)
        {
            if ($isEmpty || \in_array(\basename($file, '.php'), $lang, true))
            {
                $tokens += require $file;
            }
        }

        foreach ($tokens as $code => $ngrams)
        {
            // Flip the ranked list so the rank of an n-gram can be looked up by key.
            $this->tokens[$code] = \array_flip($ngrams);
        }
    }

    /**
     * Detects the language from a given text string
     *
     * Each n-gram of the text is compared with every loaded profile. An n-gram
     * found in the profile adds the absolute difference between its rank in the
     * text and its rank in the profile. A missing n-gram adds $this->maxNgrams.
     * The score is 1 - sum / (maxNgrams * number of n-grams), sorted descending.
     *
     * @param string $str
     * @return LanguageResult
     */
    public function detect(string $str): LanguageResult
    {
        $str = \mb_strtolower($str);

        $samples = $this->getNgrams($str);

        $result = [];

        if (\count($samples) > 0)
        {
            foreach ($this->tokens as $lang => $value)
            {
                $index = $sum = 0;

                foreach ($samples as $v)
                {
                    if (isset($value[$v]))
                    {
                        $sum += \abs($index++ - $value[$v]);
                        continue;
                    }

                    $sum += $this->maxNgrams;
                    ++$index;
                }

                $result[$lang] = 1 - ($sum / ($this->maxNgrams * $index));
            }

            \arsort($result, SORT_NUMERIC);
        }

        return new LanguageResult($result);
    }
}

// File: vendor/patrickschur/language-detection/src/LanguageDetection/LanguageResult.php

/**
 * Class LanguageResult
 *
 * Wrapper around a "language code => score" map.
 *
 * @copyright Patrick Schur
 * @license https://opensource.org/licenses/mit-license.html MIT
 * @author Patrick Schur <patrick_schur@outlook.de>
 * @package LanguageDetection
 */
class LanguageResult implements \JsonSerializable, \IteratorAggregate, \ArrayAccess
{
    /**
     * Maximum distance from the first score for an entry to be kept by bestResults()
     */
    public const THRESHOLD = .025;

    /**
     * @var array
     */
    private $result = [];

    /**
     * LanguageResult constructor.
     * @param array $result
     */
    public function __construct(array $result = [])
    {
        $this->result = $result;
    }

    /**
     * @param mixed $offset
     * @return bool
     */
    public function offsetExists($offset): bool
    {
        return isset($this->result[$offset]);
    }

    /**
     * @param mixed $offset
     * @return float|null The stored score, or null when the offset is not set
     */
    public function offsetGet($offset): ?float
    {
        return $this->result[$offset] ?? null;
    }

    /**
     * @param mixed $offset
     * @param mixed $value
     * @return void
     */
    public function offsetSet($offset, $value): void
    {
        if (null === $offset)
        {
            $this->result[] = $value;
        }
        else
        {
            $this->result[$offset] = $value;
        }
    }

    /**
     * @param mixed $offset
     */
    public function offsetUnset($offset): void
    {
        unset($this->result[$offset]);
    }

    /**
     * @return array
     */
    public function jsonSerialize(): array
    {
        return $this->result;
    }

    /**
     * Returns the key at the current array position (the first entry for a
     * freshly built result), or an empty string when there are no results.
     *
     * @return string
     */
    public function __toString(): string
    {
        return (string) \key($this->result);
    }

    /**
     * Keeps only the given language codes.
     *
     * @param string ...$whitelist
     * @return LanguageResult
     */
    public function whitelist(string ...$whitelist): LanguageResult
    {
        return new LanguageResult(\array_intersect_key($this->result, \array_flip($whitelist)));
    }

    /**
     * Removes the given language codes.
     *
     * @param string ...$blacklist
     * @return LanguageResult
     */
    public function blacklist(string ...$blacklist): LanguageResult
    {
        return new LanguageResult(\array_diff_key($this->result, \array_flip($blacklist)));
    }

    /**
     * Returns the underlying result array.
     *
     * @return array
     */
    public function close(): array
    {
        return $this->result;
    }

    /**
     * Keeps every entry whose score is within THRESHOLD of the first entry's score.
     *
     * @return LanguageResult
     */
    public function bestResults(): LanguageResult
    {
        if (!\count($this->result))
        {
            return new LanguageResult;
        }

        $first = \array_values($this->result)[0];

        return new LanguageResult(\array_filter($this->result, function ($value) use ($first) {
            return ($first - $value) <= self::THRESHOLD;
        }));
    }

    /**
     * @return \ArrayIterator
     */
    public function getIterator(): \ArrayIterator
    {
        return new \ArrayIterator($this->result);
    }

    /**
     * Returns a slice of the results, see array_slice() for the semantics.
     *
     * @param int $offset
     * @param int|null $length
     * @return LanguageResult
     */
    public function limit(int $offset, ?int $length = null): LanguageResult
    {
        return new LanguageResult(\array_slice($this->result, $offset, $length));
    }
}

// File: vendor/patrickschur/language-detection/src/LanguageDetection/Tokenizer/TokenizerInterface.php

namespace LanguageDetection\Tokenizer;

/**
 * Interface TokenizerInterface
 *
 * @copyright Patrick Schur
 * @license https://opensource.org/licenses/mit-license.html MIT
 * @author Patrick Schur <patrick_schur@outlook.de>
 * @package LanguageDetection
 */
interface TokenizerInterface
{
    /**
     * Splits a text into the words that n-grams are extracted from.
     *
     * @param string $str
     * @return array
     */
    public function tokenize(string $str): array;
}
<?php

declare(strict_types = 1);

// File: vendor/patrickschur/language-detection/src/LanguageDetection/Tokenizer/WhitespaceTokenizer.php

namespace LanguageDetection\Tokenizer;

/**
 * Class WhitespaceTokenizer
 *
 * @copyright Patrick Schur
 * @license https://opensource.org/licenses/mit-license.html MIT
 * @author Patrick Schur <patrick_schur@outlook.de>
 * @package LanguageDetection
 */
class WhitespaceTokenizer implements TokenizerInterface
{
    /**
     * Splits the text on runs of non-letter characters and wraps every
     * resulting word in underscores.
     *
     * The look-behind stops a separator run from ending on an apostrophe,
     * a backtick or U+2019.
     *
     * @param string $str
     * @return array
     */
    public function tokenize(string $str): array
    {
        $words = \preg_split('/[^\pL]+(?<![\x27\x60\x{2019}])/u', $str, -1, PREG_SPLIT_NO_EMPTY);

        $wrap = function ($word) {
            return '_' . $word . '_';
        };

        return \array_map($wrap, $words);
    }
}
<?php

declare(strict_types = 1);

// File: vendor/patrickschur/language-detection/src/LanguageDetection/Trainer.php

namespace LanguageDetection;

/**
 * Class Trainer
 *
 * @copyright Patrick Schur
 * @license https://opensource.org/licenses/mit-license.html MIT
 * @author Patrick Schur <patrick_schur@outlook.de>
 * @package LanguageDetection
 */
class Trainer extends NgramParser
{
    /**
     * Generates language profiles for all language files
     *
     * For every "<dir>/<sub>/<name>.txt" file a sibling "<name>.php" file is
     * written that returns [ <name> => list of n-grams of the lower-cased text ].
     *
     * @param string $dirname Name of the directory where the translations files are located
     * @return void
     * @throws \InvalidArgumentException if $dirname is given but is not a readable directory
     * @throws \RuntimeException if a training file cannot be read
     */
    public function learn(string $dirname = '')
    {
        // Compare against '' explicitly: empty() would also treat "0" as "not given".
        if ('' === $dirname)
        {
            $dirname = __DIR__ . '/../../resources/*/*.txt';
        }
        else if (!\is_dir($dirname) || !\is_readable($dirname))
        {
            throw new \InvalidArgumentException('Provided directory could not be found or is not readable');
        }
        else
        {
            $dirname = \rtrim($dirname, '/');
            $dirname .= '/*/*.txt';
        }

        /** @var \SplFileInfo $txt */
        foreach (new \GlobIterator($dirname) as $txt)
        {
            $path = $txt->getPathname();
            $raw = \file_get_contents($path);

            // file_get_contents() returns false on failure. Passing that to
            // mb_strtolower() would raise an unrelated TypeError under strict types.
            if (false === $raw)
            {
                throw new \RuntimeException(\sprintf('Training file "%s" could not be read', $path));
            }

            $content = \mb_strtolower($raw);

            \file_put_contents(
                // Swap the trailing "txt" extension for "php".
                \substr_replace($path, 'php', -3),
                \sprintf("<?php\n\nreturn %s;\n", var_export([ $txt->getBasename('.txt') => $this->getNgrams($content) ], true))
            );
        }
    }
}