blob: 68bb6013a8e34dd537f815d99b6694fac90a575d (
plain) (
tree)
|
|
<?php
declare(strict_types = 1);
namespace LanguageDetection\Tokenizer;
/**
* Class WhitespaceTokenizer
*
* @copyright Patrick Schur
* @license https://opensource.org/licenses/mit-license.html MIT
* @author Patrick Schur <patrick_schur@outlook.de>
* @package LanguageDetection
*/
class WhitespaceTokenizer implements TokenizerInterface
{
    /**
     * Splits a string into word tokens and wraps every token in underscores.
     *
     * Any run of non-letter characters acts as a separator. The negative
     * lookbehind forbids a separator run from ending in an apostrophe (U+0027),
     * a backtick (U+0060) or a right single quotation mark (U+2019), so one of
     * these characters standing directly in front of a letter is never split
     * off (e.g. "don't" stays a single token).
     *
     * @param string $str Text to tokenize; must be valid UTF-8 because the pattern uses the "u" modifier.
     * @return string[] Tokens in order of appearance, each formatted as "_token_".
     * @throws \RuntimeException If preg_split() fails, e.g. on malformed UTF-8 input.
     */
    public function tokenize(string $str): array
    {
        $words = \preg_split('/[^\pL]+(?<![\x27\x60\x{2019}])/u', $str, -1, \PREG_SPLIT_NO_EMPTY);

        // preg_split() reports failure by returning false; handing that to
        // array_map() would only surface as an unrelated TypeError.
        if ($words === false) {
            throw new \RuntimeException(
                \sprintf('Unable to tokenize input: preg_split() failed with PCRE error code %d.', \preg_last_error())
            );
        }

        return \array_map(static function (string $word): string {
            return "_{$word}_";
        }, $words);
    }
}
|