blob: 68bb6013a8e34dd537f815d99b6694fac90a575d (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
<?php
declare(strict_types = 1);
namespace LanguageDetection\Tokenizer;
/**
 * Class WhitespaceTokenizer
 *
 * Splits a string into word tokens and wraps every token in underscores.
 * Despite the class name, any run of non-letter characters (not only
 * whitespace) acts as a separator.
 *
 * @copyright Patrick Schur
 * @license https://opensource.org/licenses/mit-license.html MIT
 * @author Patrick Schur <patrick_schur@outlook.de>
 * @package LanguageDetection
 */
class WhitespaceTokenizer implements TokenizerInterface
{
    /**
     * Breaks the given string into words and surrounds each word with "_".
     *
     * Example: "Hello, world" becomes ["_Hello_", "_world_"].
     *
     * @param string $str UTF-8 encoded text to tokenize
     * @return string[] list of tokens, each of the form "_word_"; empty when $str contains no letters
     * @throws \InvalidArgumentException if the string cannot be split, e.g. because it is not valid UTF-8
     */
    public function tokenize(string $str): array
    {
        // A separator is a run of one or more non-letter characters. The
        // lookbehind forbids a separator from ending on an apostrophe (U+0027),
        // a backtick (U+0060) or a right single quotation mark (U+2019), which
        // keeps contractions such as "don't" in one piece.
        $words = \preg_split('/[^\pL]+(?<![\x27\x60\x{2019}])/u', $str, -1, \PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false on failure (with the "u" modifier this
        // happens for malformed UTF-8). Passing that on to array_map() would
        // only surface as an unhelpful TypeError, so fail with a clear message.
        if ($words === false) {
            throw new \InvalidArgumentException(\sprintf(
                'Unable to tokenize the given string (PCRE error code %d); is it valid UTF-8?',
                \preg_last_error()
            ));
        }

        return \array_map(static function (string $word): string {
            return "_{$word}_";
        }, $words);
    }
}
|