aboutsummaryrefslogtreecommitdiffstats
path: root/vendor/pear/text_languagedetect/Text/LanguageDetect/Parser.php
diff options
context:
space:
mode:
authorKlaus Weidenbach <Klaus.Weidenbach@gmx.net>2017-10-25 01:57:18 +0200
committerKlaus Weidenbach <Klaus.Weidenbach@gmx.net>2017-10-29 22:00:06 +0100
commit8e4c5db766ce23d05b8507991b04fece743147de (patch)
tree55c89f2c145f47245e7d32380c92256051d6a8f2 /vendor/pear/text_languagedetect/Text/LanguageDetect/Parser.php
parentfe5f1e4d67d999ed3c6ef78dc4d49f5dd1a93056 (diff)
downloadvolse-hubzilla-8e4c5db766ce23d05b8507991b04fece743147de.tar.gz
volse-hubzilla-8e4c5db766ce23d05b8507991b04fece743147de.tar.bz2
volse-hubzilla-8e4c5db766ce23d05b8507991b04fece743147de.zip
:arrow_up: Update Text_LanguageDetect.
Update from v0.3.0 (2012) to v1.0.0 (2017) which should remove some warnings and improve PHP7 support. Using composer to handle this PEAR library now. Fix a problem in FeedutilsTest.
Diffstat (limited to 'vendor/pear/text_languagedetect/Text/LanguageDetect/Parser.php')
-rw-r--r--vendor/pear/text_languagedetect/Text/LanguageDetect/Parser.php358
1 files changed, 358 insertions, 0 deletions
diff --git a/vendor/pear/text_languagedetect/Text/LanguageDetect/Parser.php b/vendor/pear/text_languagedetect/Text/LanguageDetect/Parser.php
new file mode 100644
index 000000000..3ec177640
--- /dev/null
+++ b/vendor/pear/text_languagedetect/Text/LanguageDetect/Parser.php
@@ -0,0 +1,358 @@
+<?php
+/**
+ * Part of Text_LanguageDetect
+ *
+ * PHP version 5
+ *
+ * @category Text
+ * @package Text_LanguageDetect
+ * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com>
+ * @copyright 2006 Nicholas Pisarro
+ * @license BSD http://www.opensource.org/licenses/bsd-license.php
+ * @link http://pear.php.net/package/Text_LanguageDetect/
+ */
+
+/**
+ * This class represents a text sample to be parsed.
+ *
+ * This separates the analysis of a text sample from the primary LanguageDetect
+ * class. After a new profile has been built, the data can be retrieved using
+ * the accessor functions.
+ *
+ * This class is intended to be used by the Text_LanguageDetect class, not
+ * end-users.
+ *
+ * @category Text
+ * @package Text_LanguageDetect
+ * @author Nicholas Pisarro <infinityminusnine+pear@gmail.com>
+ * @copyright 2006 Nicholas Pisarro
+ * @license BSD http://www.opensource.org/licenses/bsd-license.php
+ * @version Release: @package_version@
+ * @link http://pear.php.net/package/Text_LanguageDetect/
+ */
+class Text_LanguageDetect_Parser extends Text_LanguageDetect
+{
+ /**
+ * The piece of text being parsed
+ *
+ * @var string
+ */
+ protected $_string;
+
+ /**
+ * Stores the trigram frequencies of the sample
+ *
+ * @var string
+ */
+ protected $_trigrams = array();
+
+ /**
+ * Stores the trigram ranks of the sample
+ *
+ * @var array
+ */
+ protected $_trigram_ranks = array();
+
+ /**
+ * Stores the unicode blocks of the sample
+ *
+ * @var array
+ */
+ protected $_unicode_blocks = array();
+
+ /**
+ * Whether the parser should compile the unicode ranges
+ *
+ * @var bool
+ */
+ protected $_compile_unicode = false;
+
+ /**
+ * Whether the parser should compile trigrams
+ *
+ * @var bool
+ */
+ protected $_compile_trigram = false;
+
+ /**
+ * Whether the trigram parser should pad the beginning of the string
+ *
+ * @var bool
+ */
+ protected $_trigram_pad_start = false;
+
+ /**
+ * Whether the unicode parser should skip non-alphabetical ascii chars
+ *
+ * @var bool
+ */
+ protected $_unicode_skip_symbols = true;
+
+ /**
+ * Constructor
+ *
+ * @param string $string string to be parsed
+ */
+ public function __construct($string)
+ {
+ $this->_string = $string;
+ }
+
+ /**
+ * PHP 4 constructor for backwards compatibility.
+ *
+ * @param string $string string to be parsed
+ *
+ * @return void
+ */
+ public function Text_LanguageDetect_Parser($string)
+ {
+ self::__construct($string);
+ }
+
+ /**
+ * Returns true if a string is suitable for parsing
+ *
+ * @param string $str input string to test
+ *
+ * @return bool true if acceptable, false if not
+ */
+ public static function validateString($str)
+ {
+ if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ /**
+ * Turn on/off trigram counting
+ *
+ * @param bool $bool true for on, false for off
+ *
+ * @return void
+ */
+ public function prepareTrigram($bool = true)
+ {
+ $this->_compile_trigram = $bool;
+ }
+
+ /**
+ * Turn on/off unicode block counting
+ *
+ * @param bool $bool true for on, false for off
+ *
+ * @return void
+ */
+ public function prepareUnicode($bool = true)
+ {
+ $this->_compile_unicode = $bool;
+ }
+
+ /**
+ * Turn on/off padding the beginning of the sample string
+ *
+ * @param bool $bool true for on, false for off
+ *
+ * @return void
+ */
+ public function setPadStart($bool = true)
+ {
+ $this->_trigram_pad_start = $bool;
+ }
+
+ /**
+ * Should the unicode block counter skip non-alphabetical ascii chars?
+ *
+ * @param bool $bool true for on, false for off
+ *
+ * @return void
+ */
+ public function setUnicodeSkipSymbols($bool = true)
+ {
+ $this->_unicode_skip_symbols = $bool;
+ }
+
+ /**
+ * Returns the trigram ranks for the text sample
+ *
+ * @return array Trigram ranks in the text sample
+ */
+ public function getTrigramRanks()
+ {
+ return $this->_trigram_ranks;
+ }
+
+ /**
+ * Return the trigram freqency table
+ *
+ * Only used in testing to make sure the parser is working
+ *
+ * @return array Trigram freqencies in the text sample
+ */
+ public function getTrigramFreqs()
+ {
+ return $this->_trigram;
+ }
+
+ /**
+ * Returns the array of unicode blocks
+ *
+ * @return array Unicode blocks in the text sample
+ */
+ public function getUnicodeBlocks()
+ {
+ return $this->_unicode_blocks;
+ }
+
+ /**
+ * Executes the parsing operation
+ *
+ * Be sure to call the set*() functions to set options and the
+ * prepare*() functions first to tell it what kind of data to compute
+ *
+ * Afterwards the get*() functions can be used to access the compiled
+ * information.
+ *
+ * @return void
+ */
+ public function analyze()
+ {
+ $len = strlen($this->_string);
+ $byte_counter = 0;
+
+
+ // unicode startup
+ if ($this->_compile_unicode) {
+ $blocks = $this->_read_unicode_block_db();
+ $block_count = count($blocks);
+
+ $skipped_count = 0;
+ $unicode_chars = array();
+ }
+
+ // trigram startup
+ if ($this->_compile_trigram) {
+ // initialize them as blank so the parser will skip the first two
+ // (since it skips trigrams with more than 2 contiguous spaces)
+ $a = ' ';
+ $b = ' ';
+
+ // kludge
+ // if it finds a valid trigram to start and the start pad option is
+ // off, then set a variable that will be used to reduce this
+ // trigram after parsing has finished
+ if (!$this->_trigram_pad_start) {
+ $a = $this->_next_char($this->_string, $byte_counter, true);
+
+ if ($a != ' ') {
+ $b = $this->_next_char($this->_string, $byte_counter, true);
+ $dropone = " $a$b";
+ }
+
+ $byte_counter = 0;
+ $a = ' ';
+ $b = ' ';
+ }
+ }
+
+ while ($byte_counter < $len) {
+ $char = $this->_next_char($this->_string, $byte_counter, true);
+
+
+ // language trigram detection
+ if ($this->_compile_trigram) {
+ if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
+ if (!isset($this->_trigram[$a . $b . $char])) {
+ $this->_trigram[$a . $b . $char] = 1;
+ } else {
+ $this->_trigram[$a . $b . $char]++;
+ }
+ }
+
+ $a = $b;
+ $b = $char;
+ }
+
+ // unicode block detection
+ if ($this->_compile_unicode) {
+ if ($this->_unicode_skip_symbols
+ && strlen($char) == 1
+ && ($char < 'A' || $char > 'z'
+ || ($char > 'Z' && $char < 'a'))
+ && $char != "'"
+ ) { // does not skip the apostrophe
+ // since it's included in the language
+ // models
+
+ $skipped_count++;
+ continue;
+ }
+
+ // build an array of all the characters
+ if (isset($unicode_chars[$char])) {
+ $unicode_chars[$char]++;
+ } else {
+ $unicode_chars[$char] = 1;
+ }
+ }
+
+ // todo: add byte detection here
+ }
+
+ // unicode cleanup
+ if ($this->_compile_unicode) {
+ foreach ($unicode_chars as $utf8_char => $count) {
+ $search_result = $this->_unicode_block_name(
+ $this->_utf8char2unicode($utf8_char), $blocks, $block_count
+ );
+
+ if ($search_result != -1) {
+ $block_name = $search_result[2];
+ } else {
+ $block_name = '[Malformatted]';
+ }
+
+ if (isset($this->_unicode_blocks[$block_name])) {
+ $this->_unicode_blocks[$block_name] += $count;
+ } else {
+ $this->_unicode_blocks[$block_name] = $count;
+ }
+ }
+ }
+
+
+ // trigram cleanup
+ if ($this->_compile_trigram) {
+ // pad the end
+ if ($b != ' ') {
+ if (!isset($this->_trigram["$a$b "])) {
+ $this->_trigram["$a$b "] = 1;
+ } else {
+ $this->_trigram["$a$b "]++;
+ }
+ }
+
+ // perl compatibility; Language::Guess does not pad the beginning
+ // kludge
+ if (isset($dropone)) {
+ if ($this->_trigram[$dropone] == 1) {
+ unset($this->_trigram[$dropone]);
+ } else {
+ $this->_trigram[$dropone]--;
+ }
+ }
+
+ if (!empty($this->_trigram)) {
+ $this->_trigram_ranks = $this->_arr_rank($this->_trigram);
+ } else {
+ $this->_trigram_ranks = array();
+ }
+ }
+ }
+}
+
+/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
+
+?>