aboutsummaryrefslogtreecommitdiffstats
path: root/vendor/patrickschur/language-detection/src
diff options
context:
space:
mode:
authorMario <mario@mariovavti.com>2023-11-25 17:12:28 +0100
committerMario <mario@mariovavti.com>2023-11-25 17:12:28 +0100
commit0fd8e02a884a2b040dca62ab5d9674db5f6a070b (patch)
tree586ee43f32f6f14368c09026f21dcd3244ea24b6 /vendor/patrickschur/language-detection/src
parent82e704ec5b107823c09f1387e9091adee53a4c2d (diff)
parent55c4bfb67009c598f25b1a8189604bfffa73dfbb (diff)
downloadvolse-hubzilla-0fd8e02a884a2b040dca62ab5d9674db5f6a070b.tar.gz
volse-hubzilla-0fd8e02a884a2b040dca62ab5d9674db5f6a070b.tar.bz2
volse-hubzilla-0fd8e02a884a2b040dca62ab5d9674db5f6a070b.zip
Merge branch '8.8RC'8.8
Diffstat (limited to 'vendor/patrickschur/language-detection/src')
-rw-r--r--vendor/patrickschur/language-detection/src/LanguageDetection/Language.php102
-rw-r--r--vendor/patrickschur/language-detection/src/LanguageDetection/LanguageResult.php149
-rw-r--r--vendor/patrickschur/language-detection/src/LanguageDetection/NgramParser.php153
-rw-r--r--vendor/patrickschur/language-detection/src/LanguageDetection/Tokenizer/TokenizerInterface.php18
-rw-r--r--vendor/patrickschur/language-detection/src/LanguageDetection/Tokenizer/WhitespaceTokenizer.php29
-rw-r--r--vendor/patrickschur/language-detection/src/LanguageDetection/Trainer.php50
6 files changed, 501 insertions, 0 deletions
diff --git a/vendor/patrickschur/language-detection/src/LanguageDetection/Language.php b/vendor/patrickschur/language-detection/src/LanguageDetection/Language.php
new file mode 100644
index 000000000..c369a3367
--- /dev/null
+++ b/vendor/patrickschur/language-detection/src/LanguageDetection/Language.php
@@ -0,0 +1,102 @@
+<?php
+
+declare(strict_types = 1);
+
+namespace LanguageDetection;
+
+/**
+ * Class Language
+ *
+ * @copyright Patrick Schur
+ * @license https://opensource.org/licenses/mit-license.html MIT
+ * @author Patrick Schur <patrick_schur@outlook.de>
+ * @package LanguageDetection
+ */
+class Language extends NgramParser
+{
+    /**
+     * Loaded profiles: language code => (n-gram => position of that n-gram in the profile list).
+     *
+     * @var array<string, array<string, int>>
+     */
+    protected $tokens = [];
+
+    /**
+     * Loads all language files
+     *
+     * Each PHP file found one directory level below $dirname is required and is expected to
+     * return an array mapping a language code to its ordered n-gram list. The lists are flipped
+     * so that detect() can look up the position of an n-gram by key.
+     *
+     * @param array $lang List of ISO 639-1 codes, that should be used in the detection phase;
+     *                    an empty list loads every file found
+     * @param string $dirname Name of the directory where the translations files are located;
+     *                        an empty value selects the bundled resources directory
+     * @throws \InvalidArgumentException If $dirname is given but is not a readable directory
+     */
+    public function __construct(array $lang = [], string $dirname = '')
+    {
+        if (empty($dirname)) {
+            $pattern = __DIR__ . '/../../resources/*/*.php';
+        } elseif (!\is_dir($dirname) || !\is_readable($dirname)) {
+            throw new \InvalidArgumentException('Provided directory could not be found or is not readable');
+        } else {
+            $pattern = \rtrim($dirname, '/') . '/*/*.php';
+        }
+
+        // glob() returns false on error; use an empty list then instead of iterating a boolean.
+        $files = \glob($pattern) ?: [];
+        $loadAll = empty($lang);
+        $profiles = [];
+
+        foreach ($files as $file) {
+            // Strict comparison so a requested code only matches an identical file name.
+            if ($loadAll || \in_array(\basename($file, '.php'), $lang, true)) {
+                // Array union: when several files return the same code, the first one loaded wins.
+                $profiles += require $file;
+            }
+        }
+
+        foreach ($profiles as $code => $ngrams) {
+            $this->tokens[$code] = \array_flip($ngrams);
+        }
+    }
+
+    /**
+     * Detects the language from a given text string
+     *
+     * The text is lower-cased and reduced to its ordered n-gram list. For every loaded language
+     * each n-gram contributes the distance between its position in the text list and in the
+     * profile, or $this->maxNgrams when the profile does not contain it. The sum is normalised
+     * so that a smaller total distance yields a higher score.
+     *
+     * @param string $str
+     * @return LanguageResult Scores keyed by language code, sorted from highest to lowest;
+     *                        holds no entries when the text yields no n-grams
+     */
+    public function detect(string $str): LanguageResult
+    {
+        $samples = $this->getNgrams(\mb_strtolower($str));
+        $result = [];
+
+        if (\count($samples) > 0) {
+            foreach ($this->tokens as $code => $positions) {
+                $index = $sum = 0;
+
+                foreach ($samples as $ngram) {
+                    // Cost of one n-gram: position distance if the profile knows it, else the maximum.
+                    $sum += isset($positions[$ngram])
+                        ? \abs($index - $positions[$ngram])
+                        : $this->maxNgrams;
+                    ++$index;
+                }
+
+                $result[$code] = 1 - ($sum / ($this->maxNgrams * $index));
+            }
+
+            \arsort($result, SORT_NUMERIC);
+        }
+
+        return new LanguageResult($result);
+    }
+}
diff --git a/vendor/patrickschur/language-detection/src/LanguageDetection/LanguageResult.php b/vendor/patrickschur/language-detection/src/LanguageDetection/LanguageResult.php
new file mode 100644
index 000000000..5b89ff44e
--- /dev/null
+++ b/vendor/patrickschur/language-detection/src/LanguageDetection/LanguageResult.php
@@ -0,0 +1,149 @@
+<?php
+
+declare(strict_types = 1);
+
+namespace LanguageDetection;
+
+/**
+ * Class LanguageResult
+ *
+ * @copyright Patrick Schur
+ * @license https://opensource.org/licenses/mit-license.html MIT
+ * @author Patrick Schur <patrick_schur@outlook.de>
+ * @package LanguageDetection
+ */
+class LanguageResult implements \JsonSerializable, \IteratorAggregate, \ArrayAccess
+{
+    /** Largest gap to the first entry's value that bestResults() still accepts. */
+    const THRESHOLD = .025;
+
+    /**
+     * @var array Values keyed by language code, in the order passed to the constructor
+     */
+    private $result = [];
+
+    /**
+     * LanguageResult constructor.
+     * @param array $result Values keyed by language code; stored as given, without sorting
+     */
+    public function __construct(array $result = [])
+    {
+        $this->result = $result;
+    }
+
+    /**
+     * @param mixed $offset
+     * @return bool True when the key exists and its value is not null
+     */
+    public function offsetExists($offset): bool
+    {
+        return isset($this->result[$offset]);
+    }
+
+    /**
+     * @param mixed $offset
+     * @return float|null Value stored under the key, or null when the key is missing
+     */
+    public function offsetGet($offset): ?float
+    {
+        return $this->result[$offset] ?? null;
+    }
+
+    /**
+     * @param mixed $offset Key to write; null appends the value under the next integer key
+     * @param mixed $value
+     * @return void
+     */
+    public function offsetSet($offset, $value): void
+    {
+        if (null === $offset) {
+            $this->result[] = $value;
+        } else {
+            $this->result[$offset] = $value;
+        }
+    }
+
+    /**
+     * @param mixed $offset Key of the entry to remove
+     */
+    public function offsetUnset($offset): void
+    {
+        unset($this->result[$offset]);
+    }
+
+    /**
+     * @return array The stored values, used as the JSON representation
+     */
+    public function jsonSerialize(): array
+    {
+        return $this->result;
+    }
+
+    /**
+     * @return string Key at the array's current position (the first key unless the pointer was moved), "" if empty
+     */
+    public function __toString(): string
+    {
+        return (string) \key($this->result);
+    }
+
+    /**
+     * @param string ...$whitelist Keys to keep
+     * @return LanguageResult New instance holding only the entries with a listed key
+     */
+    public function whitelist(string ...$whitelist): LanguageResult
+    {
+        return new LanguageResult(\array_intersect_key($this->result, \array_flip($whitelist)));
+    }
+
+    /**
+     * @param string ...$blacklist Keys to drop
+     * @return LanguageResult New instance holding the entries whose key is not listed
+     */
+    public function blacklist(string ...$blacklist): LanguageResult
+    {
+        return new LanguageResult(\array_diff_key($this->result, \array_flip($blacklist)));
+    }
+
+    /**
+     * @return array The stored values as a plain array
+     */
+    public function close(): array
+    {
+        return $this->result;
+    }
+
+    /**
+     * @return LanguageResult Entries whose value is at most THRESHOLD below the first entry's value
+     */
+    public function bestResults(): LanguageResult
+    {
+        if (!\count($this->result)) {
+            return new LanguageResult;
+        }
+
+        $first = \array_values($this->result)[0];
+
+        return new LanguageResult(\array_filter($this->result, function ($value) use ($first) {
+            return ($first - $value) <= self::THRESHOLD;
+        }));
+    }
+
+    /**
+     * @return \ArrayIterator Iterator over the stored values
+     */
+    public function getIterator(): \ArrayIterator
+    {
+        return new \ArrayIterator($this->result);
+    }
+
+    /**
+     * @param int $offset Position of the first entry to keep, as understood by array_slice()
+     * @param int|null $length Number of entries to keep; null keeps everything from $offset on
+     * @return LanguageResult New instance holding the selected slice
+     */
+    public function limit(int $offset, ?int $length = null): LanguageResult
+    {
+        return new LanguageResult(\array_slice($this->result, $offset, $length));
+    }
+}
diff --git a/vendor/patrickschur/language-detection/src/LanguageDetection/NgramParser.php b/vendor/patrickschur/language-detection/src/LanguageDetection/NgramParser.php
new file mode 100644
index 000000000..8b69241eb
--- /dev/null
+++ b/vendor/patrickschur/language-detection/src/LanguageDetection/NgramParser.php
@@ -0,0 +1,153 @@
+<?php
+
+declare(strict_types = 1);
+
+namespace LanguageDetection;
+
+use LanguageDetection\Tokenizer\TokenizerInterface;
+use LanguageDetection\Tokenizer\WhitespaceTokenizer;
+
+/**
+ * Class NgramParser
+ *
+ * @copyright Patrick Schur
+ * @license https://opensource.org/licenses/mit-license.html MIT
+ * @author Patrick Schur <patrick_schur@outlook.de>
+ * @package LanguageDetection
+ */
+abstract class NgramParser
+{
+    /**
+     * @var int Shortest n-gram length produced by getNgrams()
+     */
+    protected $minLength = 1;
+
+    /**
+     * @var int Longest n-gram length produced by getNgrams()
+     */
+    protected $maxLength = 3;
+
+    /**
+     * @var int Upper bound on how many n-grams getNgrams() returns
+     */
+    protected $maxNgrams = 310;
+
+    /**
+     * @var TokenizerInterface|null Null until setTokenizer() is called or tokenize() installs the default
+     */
+    protected $tokenizer = null;
+
+    /**
+     * Changes the shortest n-gram length.
+     * @param int $minLength Must be positive and smaller than the current maximum length
+     * @throws \LengthException When $minLength is outside that range
+     */
+    public function setMinLength(int $minLength)
+    {
+        if (!($minLength > 0 && $minLength < $this->maxLength)) {
+            throw new \LengthException('$minLength must be greater than zero and less than $this->maxLength.');
+        }
+
+        $this->minLength = $minLength;
+    }
+
+    /**
+     * Changes the longest n-gram length.
+     * @param int $maxLength Must be larger than the current minimum length
+     * @throws \LengthException When $maxLength is not larger than the minimum
+     */
+    public function setMaxLength(int $maxLength)
+    {
+        if (!($maxLength > $this->minLength)) {
+            throw new \LengthException('$maxLength must be greater than $this->minLength.');
+        }
+
+        $this->maxLength = $maxLength;
+    }
+
+    /**
+     * Changes how many n-grams getNgrams() returns at most.
+     * @param int $maxNgrams Must be positive
+     * @throws \LengthException When $maxNgrams is zero or negative
+     */
+    public function setMaxNgrams(int $maxNgrams)
+    {
+        if ($maxNgrams < 1) {
+            throw new \LengthException('$maxNgrams must be greater than zero.');
+        }
+
+        $this->maxNgrams = $maxNgrams;
+    }
+
+    /**
+     * Sets the tokenizer
+     *
+     * @param TokenizerInterface $tokenizer Replaces any tokenizer that is currently set
+     */
+    public function setTokenizer(TokenizerInterface $tokenizer)
+    {
+        $this->tokenizer = $tokenizer;
+    }
+
+    /**
+     * Splits the text with the configured tokenizer, installing a WhitespaceTokenizer if none is set.
+     * @param string $str
+     * @return array
+     */
+    private function tokenize(string $str)
+    {
+        if (!isset($this->tokenizer)) {
+            $this->tokenizer = new WhitespaceTokenizer();
+        }
+
+        return $this->tokenizer->tokenize($str);
+    }
+
+    /**
+     * Builds the ordered n-gram list of a text.
+     *
+     * Counts every n-gram of each token, converts the counts into shares per n-gram length,
+     * merges all lengths, drops the lone "_" entry and orders the rest by share, highest first.
+     *
+     * @param string $str
+     * @return array At most $this->maxNgrams n-grams; empty when there was nothing to count
+     */
+    protected function getNgrams(string $str): array
+    {
+        $counts = [];
+
+        foreach ($this->tokenize($str) as $word) {
+            $wordLength = \mb_strlen($word);
+
+            // Slide a window of every configured length across the word and count what it shows.
+            for ($size = $this->minLength; $size <= $this->maxLength; ++$size) {
+                for ($start = 0; $start + $size <= $wordLength; ++$start) {
+                    $gram = \mb_substr($word, $start, $size);
+                    $counts[$size][$gram] = ($counts[$size][$gram] ?? 0) + 1;
+                }
+            }
+        }
+
+        if (!\count($counts)) {
+            return [];
+        }
+
+        foreach ($counts as $size => $grams) {
+            $total = \array_sum($grams);
+
+            foreach ($grams as $gram => $count) {
+                $counts[$size][$gram] = $count / $total;
+            }
+        }
+
+        $shares = \array_merge(...$counts);
+        // Remove the 1-gram "_", the marker WhitespaceTokenizer wraps around each word.
+        unset($shares['_']);
+
+        \arsort($shares, SORT_NUMERIC);
+
+        $ranked = \array_keys($shares);
+
+        return \array_slice($ranked, 0, $this->maxNgrams);
+    }
+}
diff --git a/vendor/patrickschur/language-detection/src/LanguageDetection/Tokenizer/TokenizerInterface.php b/vendor/patrickschur/language-detection/src/LanguageDetection/Tokenizer/TokenizerInterface.php
new file mode 100644
index 000000000..f06074628
--- /dev/null
+++ b/vendor/patrickschur/language-detection/src/LanguageDetection/Tokenizer/TokenizerInterface.php
@@ -0,0 +1,18 @@
+<?php
+
+declare(strict_types = 1);
+
+namespace LanguageDetection\Tokenizer;
+
+/**
+ * Interface TokenizerInterface - contract for classes that split a string into an array of tokens.
+ *
+ * @copyright Patrick Schur
+ * @license https://opensource.org/licenses/mit-license.html MIT
+ * @author Patrick Schur <patrick_schur@outlook.de>
+ * @package LanguageDetection
+ */
+interface TokenizerInterface
+{
+ public function tokenize(string $str): array;
+} \ No newline at end of file
diff --git a/vendor/patrickschur/language-detection/src/LanguageDetection/Tokenizer/WhitespaceTokenizer.php b/vendor/patrickschur/language-detection/src/LanguageDetection/Tokenizer/WhitespaceTokenizer.php
new file mode 100644
index 000000000..68bb6013a
--- /dev/null
+++ b/vendor/patrickschur/language-detection/src/LanguageDetection/Tokenizer/WhitespaceTokenizer.php
@@ -0,0 +1,29 @@
+<?php
+
+declare(strict_types = 1);
+
+namespace LanguageDetection\Tokenizer;
+
+/**
+ * Class WhitespaceTokenizer
+ *
+ * @copyright Patrick Schur
+ * @license https://opensource.org/licenses/mit-license.html MIT
+ * @author Patrick Schur <patrick_schur@outlook.de>
+ * @package LanguageDetection
+ */
+class WhitespaceTokenizer implements TokenizerInterface
+{
+    /**
+     * Splits text on runs of non-letter characters and wraps every resulting word in "_" markers.
+     *
+     * @param string $str Text to split; the pattern uses the /u modifier, so it is matched as UTF-8
+     * @return array Wrapped words such as "_word_"; empty when nothing is left or preg_split() fails
+     */
+    public function tokenize(string $str): array
+    {
+        // preg_split() returns false on failure (e.g. malformed UTF-8); treat that as "no words".
+        $words = \preg_split('/[^\pL]+(?<![\x27\x60\x{2019}])/u', $str, -1, PREG_SPLIT_NO_EMPTY) ?: [];
+        return \array_map(function ($word) { return "_{$word}_"; }, $words);
+    }
+} \ No newline at end of file
diff --git a/vendor/patrickschur/language-detection/src/LanguageDetection/Trainer.php b/vendor/patrickschur/language-detection/src/LanguageDetection/Trainer.php
new file mode 100644
index 000000000..2bc5e6761
--- /dev/null
+++ b/vendor/patrickschur/language-detection/src/LanguageDetection/Trainer.php
@@ -0,0 +1,50 @@
+<?php
+
+declare(strict_types = 1);
+
+namespace LanguageDetection;
+
+/**
+ * Class Trainer
+ *
+ * @copyright Patrick Schur
+ * @license https://opensource.org/licenses/mit-license.html MIT
+ * @author Patrick Schur <patrick_schur@outlook.de>
+ * @package LanguageDetection
+ */
+class Trainer extends NgramParser
+{
+    /**
+     * Generates language profiles for all language files
+     *
+     * Each ".txt" file one directory level below $dirname is lower-cased, reduced to its n-gram
+     * list and exported as a PHP file (same path, last three characters replaced by "php").
+     *
+     * @param string $dirname Name of the directory where the translations files are located
+     * @return void
+     * @throws \InvalidArgumentException If $dirname is given but is not a readable directory
+     */
+    public function learn(string $dirname = '')
+    {
+        if (!empty($dirname)) {
+            if (!(\is_dir($dirname) && \is_readable($dirname))) {
+                throw new \InvalidArgumentException('Provided directory could not be found or is not readable');
+            }
+
+            $pattern = \rtrim($dirname, '/') . '/*/*.txt';
+        } else {
+            $pattern = __DIR__ . '/../../resources/*/*.txt';
+        }
+
+        /** @var \SplFileInfo $sample */
+        foreach (new \GlobIterator($pattern) as $sample) {
+            $source = $sample->getPathname();
+            $profile = [$sample->getBasename('.txt') => $this->getNgrams(\mb_strtolower(\file_get_contents($source)))];
+
+            \file_put_contents(
+                \substr_replace($source, 'php', -3),
+                \sprintf("<?php\n\nreturn %s;\n", var_export($profile, true))
+            );
+        }
+    }
+}