diff options
author | Klaus Weidenbach <Klaus.Weidenbach@gmx.net> | 2017-10-25 01:57:18 +0200 |
---|---|---|
committer | Klaus Weidenbach <Klaus.Weidenbach@gmx.net> | 2017-10-29 22:00:06 +0100 |
commit | 8e4c5db766ce23d05b8507991b04fece743147de (patch) | |
tree | 55c89f2c145f47245e7d32380c92256051d6a8f2 /vendor/pear/text_languagedetect/Text/LanguageDetect/Parser.php | |
parent | fe5f1e4d67d999ed3c6ef78dc4d49f5dd1a93056 (diff) | |
download | volse-hubzilla-8e4c5db766ce23d05b8507991b04fece743147de.tar.gz volse-hubzilla-8e4c5db766ce23d05b8507991b04fece743147de.tar.bz2 volse-hubzilla-8e4c5db766ce23d05b8507991b04fece743147de.zip |
:arrow_up: Update Text_LanguageDetect.
Update from v0.3.0 (2012) to v1.0.0 (2017) which should remove some
warnings and improve PHP7 support.
Using composer to handle this PEAR library now.
Fix a problem in FeedutilsTest.
Diffstat (limited to 'vendor/pear/text_languagedetect/Text/LanguageDetect/Parser.php')
-rw-r--r-- | vendor/pear/text_languagedetect/Text/LanguageDetect/Parser.php | 358 |
1 files changed, 358 insertions, 0 deletions
diff --git a/vendor/pear/text_languagedetect/Text/LanguageDetect/Parser.php b/vendor/pear/text_languagedetect/Text/LanguageDetect/Parser.php new file mode 100644 index 000000000..3ec177640 --- /dev/null +++ b/vendor/pear/text_languagedetect/Text/LanguageDetect/Parser.php @@ -0,0 +1,358 @@
<?php
/**
 * Part of Text_LanguageDetect
 *
 * PHP version 5
 *
 * @category  Text
 * @package   Text_LanguageDetect
 * @author    Nicholas Pisarro <infinityminusnine+pear@gmail.com>
 * @copyright 2006 Nicholas Pisarro
 * @license   BSD http://www.opensource.org/licenses/bsd-license.php
 * @link      http://pear.php.net/package/Text_LanguageDetect/
 */

/**
 * This class represents a text sample to be parsed.
 *
 * This separates the analysis of a text sample from the primary LanguageDetect
 * class. After a new profile has been built, the data can be retrieved using
 * the accessor functions.
 *
 * Typical call order: construct with the sample, switch on the wanted
 * analyses with the prepare*() methods, tune them with the set*() methods,
 * call analyze(), then read the results with the get*() methods.
 *
 * This class is intended to be used by the Text_LanguageDetect class, not
 * end-users.
 *
 * @category  Text
 * @package   Text_LanguageDetect
 * @author    Nicholas Pisarro <infinityminusnine+pear@gmail.com>
 * @copyright 2006 Nicholas Pisarro
 * @license   BSD http://www.opensource.org/licenses/bsd-license.php
 * @version   Release: @package_version@
 * @link      http://pear.php.net/package/Text_LanguageDetect/
 */
class Text_LanguageDetect_Parser extends Text_LanguageDetect
{
    /**
     * The piece of text being parsed
     *
     * @var string
     */
    protected $_string;

    /**
     * Stores the trigram frequencies of the sample
     *
     * Keys are the trigrams (three consecutive characters as returned by
     * _next_char()), values are the number of times each one was seen.
     *
     * The property is named $_trigram because that is the name every method
     * of this class reads and writes; declaring it here guarantees that
     * getTrigramFreqs() returns an array even before analyze() has run and
     * avoids creating the property dynamically.
     *
     * @var array
     */
    protected $_trigram = array();

    /**
     * Stores the trigram ranks of the sample
     *
     * @var array
     */
    protected $_trigram_ranks = array();

    /**
     * Stores the unicode blocks of the sample
     *
     * Keys are block names, values are character counts.
     *
     * @var array
     */
    protected $_unicode_blocks = array();

    /**
     * Whether the parser should compile the unicode ranges
     *
     * @var bool
     */
    protected $_compile_unicode = false;

    /**
     * Whether the parser should compile trigrams
     *
     * @var bool
     */
    protected $_compile_trigram = false;

    /**
     * Whether the trigram parser should pad the beginning of the string
     *
     * @var bool
     */
    protected $_trigram_pad_start = false;

    /**
     * Whether the unicode parser should skip non-alphabetical ascii chars
     *
     * @var bool
     */
    protected $_unicode_skip_symbols = true;

    /**
     * Constructor
     *
     * @param string $string string to be parsed
     */
    public function __construct($string)
    {
        $this->_string = $string;
    }

    /**
     * PHP 4 constructor for backwards compatibility.
     *
     * Kept so that existing callers of this method name keep working; it
     * simply forwards to __construct().
     *
     * @param string $string string to be parsed
     *
     * @return void
     */
    public function Text_LanguageDetect_Parser($string)
    {
        $this->__construct($string);
    }

    /**
     * Returns true if a string is suitable for parsing
     *
     * A string is accepted when it is non-empty, longer than three bytes and
     * contains at least one non-whitespace character.
     *
     * @param string $str input string to test
     *
     * @return bool true if acceptable, false if not
     */
    public static function validateString($str)
    {
        return !empty($str)
            && strlen($str) > 3
            && preg_match('/\S/', $str) === 1;
    }

    /**
     * Turn on/off trigram counting
     *
     * @param bool $bool true for on, false for off
     *
     * @return void
     */
    public function prepareTrigram($bool = true)
    {
        $this->_compile_trigram = $bool;
    }

    /**
     * Turn on/off unicode block counting
     *
     * @param bool $bool true for on, false for off
     *
     * @return void
     */
    public function prepareUnicode($bool = true)
    {
        $this->_compile_unicode = $bool;
    }

    /**
     * Turn on/off padding the beginning of the sample string
     *
     * @param bool $bool true for on, false for off
     *
     * @return void
     */
    public function setPadStart($bool = true)
    {
        $this->_trigram_pad_start = $bool;
    }

    /**
     * Should the unicode block counter skip non-alphabetical ascii chars?
     *
     * @param bool $bool true for on, false for off
     *
     * @return void
     */
    public function setUnicodeSkipSymbols($bool = true)
    {
        $this->_unicode_skip_symbols = $bool;
    }

    /**
     * Returns the trigram ranks for the text sample
     *
     * @return array Trigram ranks in the text sample
     */
    public function getTrigramRanks()
    {
        return $this->_trigram_ranks;
    }

    /**
     * Return the trigram frequency table
     *
     * Only used in testing to make sure the parser is working
     *
     * @return array Trigram frequencies in the text sample
     */
    public function getTrigramFreqs()
    {
        return $this->_trigram;
    }

    /**
     * Returns the array of unicode blocks
     *
     * @return array Unicode blocks in the text sample
     */
    public function getUnicodeBlocks()
    {
        return $this->_unicode_blocks;
    }

    /**
     * Executes the parsing operation
     *
     * Be sure to call the set*() functions to set options and the
     * prepare*() functions first to tell it what kind of data to compute
     *
     * Afterwards the get*() functions can be used to access the compiled
     * information.
     *
     * Results are added to whatever the instance already holds, so calling
     * this method twice on the same instance counts the sample twice.
     *
     * @return void
     */
    public function analyze()
    {
        $len = strlen($this->_string);
        $byte_counter = 0;

        // unicode startup
        if ($this->_compile_unicode) {
            $blocks = $this->_read_unicode_block_db();
            $block_count = count($blocks);
            $unicode_chars = array();
        }

        // trigram startup
        if ($this->_compile_trigram) {
            // initialize them as blank so the parser will skip the first two
            // (since it skips trigrams with more than 2 contiguous spaces)
            $a = ' ';
            $b = ' ';

            // kludge
            // if it finds a valid trigram to start and the start pad option is
            // off, then set a variable that will be used to reduce this
            // trigram after parsing has finished.
            // Nothing to peek at in an empty sample, so the helper is only
            // called when there is at least one byte to read.
            if (!$this->_trigram_pad_start && $len > 0) {
                $a = $this->_next_char($this->_string, $byte_counter, true);

                if ($a != ' ') {
                    $b = $this->_next_char($this->_string, $byte_counter, true);
                    $dropone = " $a$b";
                }

                // the peek above advanced the cursor; rewind so the main loop
                // starts from the beginning of the sample again
                $byte_counter = 0;
                $a = ' ';
                $b = ' ';
            }
        }

        while ($byte_counter < $len) {
            $char = $this->_next_char($this->_string, $byte_counter, true);

            // language trigram detection
            if ($this->_compile_trigram) {
                // skip trigrams that would contain two contiguous spaces
                if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
                    $trigram = $a . $b . $char;

                    if (isset($this->_trigram[$trigram])) {
                        $this->_trigram[$trigram]++;
                    } else {
                        $this->_trigram[$trigram] = 1;
                    }
                }

                $a = $b;
                $b = $char;
            }

            // unicode block detection
            if ($this->_compile_unicode) {
                if ($this->_unicode_skip_symbols
                    && strlen($char) == 1
                    && ($char < 'A' || $char > 'z'
                    || ($char > 'Z' && $char < 'a'))
                    && $char != "'"
                ) {
                    // single-byte non-letter: ignored.
                    // does not skip the apostrophe since it's included in
                    // the language models
                    continue;
                }

                // build an array of all the characters
                if (isset($unicode_chars[$char])) {
                    $unicode_chars[$char]++;
                } else {
                    $unicode_chars[$char] = 1;
                }
            }

            // todo: add byte detection here
        }

        // unicode cleanup
        if ($this->_compile_unicode) {
            foreach ($unicode_chars as $utf8_char => $count) {
                $search_result = $this->_unicode_block_name(
                    $this->_utf8char2unicode($utf8_char), $blocks, $block_count
                );

                if ($search_result != -1) {
                    $block_name = $search_result[2];
                } else {
                    $block_name = '[Malformatted]';
                }

                if (isset($this->_unicode_blocks[$block_name])) {
                    $this->_unicode_blocks[$block_name] += $count;
                } else {
                    $this->_unicode_blocks[$block_name] = $count;
                }
            }
        }

        // trigram cleanup
        if ($this->_compile_trigram) {
            // pad the end
            if ($b != ' ') {
                $trigram = "$a$b ";

                if (isset($this->_trigram[$trigram])) {
                    $this->_trigram[$trigram]++;
                } else {
                    $this->_trigram[$trigram] = 1;
                }
            }

            // perl compatibility; Language::Guess does not pad the beginning
            // kludge
            // Only touch the entry when it was actually counted: on very
            // short samples the peeked trigram may never have been recorded.
            if (isset($dropone, $this->_trigram[$dropone])) {
                if ($this->_trigram[$dropone] == 1) {
                    unset($this->_trigram[$dropone]);
                } else {
                    $this->_trigram[$dropone]--;
                }
            }

            if (!empty($this->_trigram)) {
                $this->_trigram_ranks = $this->_arr_rank($this->_trigram);
            } else {
                $this->_trigram_ranks = array();
            }
        }
    }
}

/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */

?>