From 2f2e353ecef52639a78cac3bc407ccfe64197ac9 Mon Sep 17 00:00:00 2001 From: Mario Date: Sat, 7 Oct 2023 16:00:34 +0000 Subject: use new lang detect library which supports much more languages --- .../Tokenizer/WhitespaceTokenizer.php | 29 ++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 vendor/patrickschur/language-detection/src/LanguageDetection/Tokenizer/WhitespaceTokenizer.php (limited to 'vendor/patrickschur/language-detection/src/LanguageDetection/Tokenizer/WhitespaceTokenizer.php') diff --git a/vendor/patrickschur/language-detection/src/LanguageDetection/Tokenizer/WhitespaceTokenizer.php b/vendor/patrickschur/language-detection/src/LanguageDetection/Tokenizer/WhitespaceTokenizer.php new file mode 100644 index 000000000..68bb6013a --- /dev/null +++ b/vendor/patrickschur/language-detection/src/LanguageDetection/Tokenizer/WhitespaceTokenizer.php @@ -0,0 +1,29 @@ + + * @package LanguageDetection + */ +class WhitespaceTokenizer implements TokenizerInterface +{ + /** + * @param string $str + * @return array + */ + public function tokenize(string $str): array + { + return \array_map(function ($word) { + return "_{$word}_"; + }, + \preg_split('/[^\pL]+(?