aboutsummaryrefslogtreecommitdiffstats
path: root/vendor/patrickschur/language-detection/src/LanguageDetection/Tokenizer/WhitespaceTokenizer.php
blob: 68bb6013a8e34dd537f815d99b6694fac90a575d (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
<?php

declare(strict_types = 1);

namespace LanguageDetection\Tokenizer;

/**
 * Class WhitespaceTokenizer
 *
 * @copyright Patrick Schur
 * @license https://opensource.org/licenses/mit-license.html MIT
 * @author Patrick Schur <patrick_schur@outlook.de>
 * @package LanguageDetection
 */
class WhitespaceTokenizer implements TokenizerInterface
{
    /**
     * Splits a string into words and wraps every word in underscores.
     *
     * A separator is a run of one or more non-letter characters (anything that
     * is not \pL), shortened where necessary so that it does not end with an
     * apostrophe (U+0027), a backtick (U+0060) or a right single quotation mark
     * (U+2019). One of these quote characters directly in front of a letter is
     * therefore kept inside the token, e.g. "don't" yields the single token
     * "_don't_". Empty pieces are discarded.
     *
     * @param string $str Text to tokenize; must be valid UTF-8 because the
     *                    pattern is applied with the /u modifier.
     * @return array List of tokens, each one wrapped as "_word_". Empty when
     *               $str contains no letters or cannot be processed by PCRE
     *               (for example because it is not valid UTF-8).
     */
    public function tokenize(string $str): array
    {
        $words = \preg_split('/[^\pL]+(?<![\x27\x60\x{2019}])/u', $str, -1, \PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false instead of an array when matching fails,
        // which happens for subjects that are not valid UTF-8. Passing that on
        // to array_map() would surface as an unrelated TypeError, so treat it
        // as "no tokens".
        if (false === $words) {
            return [];
        }

        return \array_map(
            static function (string $word): string {
                return "_{$word}_";
            },
            $words
        );
    }
}