aboutsummaryrefslogtreecommitdiffstats
path: root/library/langdet/Text/LanguageDetect/Parser.php
diff options
context:
space:
mode:
authorfriendica <info@friendica.com>2012-05-12 01:55:18 -0700
committerfriendica <info@friendica.com>2012-05-12 01:55:18 -0700
commit62727012d37ef3d3cacc413d5667dc2d7bbf9cbb (patch)
tree14c222386842938da02f3d43ecb8da12c63e0dc5 /library/langdet/Text/LanguageDetect/Parser.php
parent99e4ea19e733f86259e39f5a22d64f1521abc5ae (diff)
downloadvolse-hubzilla-62727012d37ef3d3cacc413d5667dc2d7bbf9cbb.tar.gz
volse-hubzilla-62727012d37ef3d3cacc413d5667dc2d7bbf9cbb.tar.bz2
volse-hubzilla-62727012d37ef3d3cacc413d5667dc2d7bbf9cbb.zip
language detection library
Diffstat (limited to 'library/langdet/Text/LanguageDetect/Parser.php')
-rw-r--r--library/langdet/Text/LanguageDetect/Parser.php349
1 files changed, 349 insertions, 0 deletions
diff --git a/library/langdet/Text/LanguageDetect/Parser.php b/library/langdet/Text/LanguageDetect/Parser.php
new file mode 100644
index 000000000..1c20c2657
--- /dev/null
+++ b/library/langdet/Text/LanguageDetect/Parser.php
@@ -0,0 +1,349 @@
+<?php
+
+/**
+ * This class represents a text sample to be parsed.
+ *
+ * @category Text
+ * @package Text_LanguageDetect
+ * @author Nicholas Pisarro
+ * @copyright 2006
+ * @license BSD
+ * @version CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $
+ * @link http://pear.php.net/package/Text_LanguageDetect/
+ * @link http://langdetect.blogspot.com/
+ */
+
+/**
+ * This class represents a text sample to be parsed.
+ *
+ * This separates the analysis of a text sample from the primary LanguageDetect
+ * class. After a new profile has been built, the data can be retrieved using
+ * the accessor functions.
+ *
+ * This class is intended to be used by the Text_LanguageDetect class, not
+ * end-users.
+ *
+ * @category Text
+ * @package Text_LanguageDetect
+ * @author Nicholas Pisarro
+ * @copyright 2006
+ * @license BSD
+ * @version release: 0.3.0
+ */
+class Text_LanguageDetect_Parser extends Text_LanguageDetect
+{
+ /**
+ * the piece of text being parsed
+ *
+ * @access private
+ * @var string
+ */
+ var $_string;
+
+ /**
+ * stores the trigram frequencies of the sample
+ *
+ * @access private
+ * @var string
+ */
+ var $_trigrams = array();
+
+ /**
+ * stores the trigram ranks of the sample
+ *
+ * @access private
+ * @var array
+ */
+ var $_trigram_ranks = array();
+
+ /**
+ * stores the unicode blocks of the sample
+ *
+ * @access private
+ * @var array
+ */
+ var $_unicode_blocks = array();
+
+ /**
+ * Whether the parser should compile the unicode ranges
+ *
+ * @access private
+ * @var bool
+ */
+ var $_compile_unicode = false;
+
+ /**
+ * Whether the parser should compile trigrams
+ *
+ * @access private
+ * @var bool
+ */
+ var $_compile_trigram = false;
+
+ /**
+ * Whether the trigram parser should pad the beginning of the string
+ *
+ * @access private
+ * @var bool
+ */
+ var $_trigram_pad_start = false;
+
+ /**
+ * Whether the unicode parser should skip non-alphabetical ascii chars
+ *
+ * @access private
+ * @var bool
+ */
+ var $_unicode_skip_symbols = true;
+
+ /**
+ * Constructor
+ *
+ * @access private
+ * @param string $string string to be parsed
+ */
+ function Text_LanguageDetect_Parser($string) {
+ $this->_string = $string;
+ }
+
+ /**
+ * Returns true if a string is suitable for parsing
+ *
+ * @param string $str input string to test
+ * @return bool true if acceptable, false if not
+ */
+ public static function validateString($str) {
+ if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ /**
+ * turn on/off trigram counting
+ *
+ * @access public
+ * @param bool $bool true for on, false for off
+ */
+ function prepareTrigram($bool = true)
+ {
+ $this->_compile_trigram = $bool;
+ }
+
+ /**
+ * turn on/off unicode block counting
+ *
+ * @access public
+ * @param bool $bool true for on, false for off
+ */
+ function prepareUnicode($bool = true)
+ {
+ $this->_compile_unicode = $bool;
+ }
+
+ /**
+ * turn on/off padding the beginning of the sample string
+ *
+ * @access public
+ * @param bool $bool true for on, false for off
+ */
+ function setPadStart($bool = true)
+ {
+ $this->_trigram_pad_start = $bool;
+ }
+
+ /**
+ * Should the unicode block counter skip non-alphabetical ascii chars?
+ *
+ * @access public
+ * @param bool $bool true for on, false for off
+ */
+ function setUnicodeSkipSymbols($bool = true)
+ {
+ $this->_unicode_skip_symbols = $bool;
+ }
+
+ /**
+ * Returns the trigram ranks for the text sample
+ *
+ * @access public
+ * @return array trigram ranks in the text sample
+ */
+ function &getTrigramRanks()
+ {
+ return $this->_trigram_ranks;
+ }
+
+ /**
+ * Return the trigram freqency table
+ *
+ * only used in testing to make sure the parser is working
+ *
+ * @access public
+ * @return array trigram freqencies in the text sample
+ */
+ function &getTrigramFreqs()
+ {
+ return $this->_trigram;
+ }
+
+ /**
+ * returns the array of unicode blocks
+ *
+ * @access public
+ * @return array unicode blocks in the text sample
+ */
+ function &getUnicodeBlocks()
+ {
+ return $this->_unicode_blocks;
+ }
+
+ /**
+ * Executes the parsing operation
+ *
+ * Be sure to call the set*() functions to set options and the
+ * prepare*() functions first to tell it what kind of data to compute
+ *
+ * Afterwards the get*() functions can be used to access the compiled
+ * information.
+ *
+ * @access public
+ */
+ function analyze()
+ {
+ $len = strlen($this->_string);
+ $byte_counter = 0;
+
+
+ // unicode startup
+ if ($this->_compile_unicode) {
+ $blocks = $this->_read_unicode_block_db();
+ $block_count = count($blocks);
+
+ $skipped_count = 0;
+ $unicode_chars = array();
+ }
+
+ // trigram startup
+ if ($this->_compile_trigram) {
+ // initialize them as blank so the parser will skip the first two
+ // (since it skips trigrams with more than 2 contiguous spaces)
+ $a = ' ';
+ $b = ' ';
+
+ // kludge
+ // if it finds a valid trigram to start and the start pad option is
+ // off, then set a variable that will be used to reduce this
+ // trigram after parsing has finished
+ if (!$this->_trigram_pad_start) {
+ $a = $this->_next_char($this->_string, $byte_counter, true);
+
+ if ($a != ' ') {
+ $b = $this->_next_char($this->_string, $byte_counter, true);
+ $dropone = " $a$b";
+ }
+
+ $byte_counter = 0;
+ $a = ' ';
+ $b = ' ';
+ }
+ }
+
+ while ($byte_counter < $len) {
+ $char = $this->_next_char($this->_string, $byte_counter, true);
+
+
+ // language trigram detection
+ if ($this->_compile_trigram) {
+ if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
+ if (!isset($this->_trigram[$a . $b . $char])) {
+ $this->_trigram[$a . $b . $char] = 1;
+ } else {
+ $this->_trigram[$a . $b . $char]++;
+ }
+ }
+
+ $a = $b;
+ $b = $char;
+ }
+
+ // unicode block detection
+ if ($this->_compile_unicode) {
+ if ($this->_unicode_skip_symbols
+ && strlen($char) == 1
+ && ($char < 'A' || $char > 'z'
+ || ($char > 'Z' && $char < 'a'))
+ && $char != "'") { // does not skip the apostrophe
+ // since it's included in the language
+ // models
+
+ $skipped_count++;
+ continue;
+ }
+
+ // build an array of all the characters
+ if (isset($unicode_chars[$char])) {
+ $unicode_chars[$char]++;
+ } else {
+ $unicode_chars[$char] = 1;
+ }
+ }
+
+ // todo: add byte detection here
+ }
+
+ // unicode cleanup
+ if ($this->_compile_unicode) {
+ foreach ($unicode_chars as $utf8_char => $count) {
+ $search_result = $this->_unicode_block_name(
+ $this->_utf8char2unicode($utf8_char), $blocks, $block_count);
+
+ if ($search_result != -1) {
+ $block_name = $search_result[2];
+ } else {
+ $block_name = '[Malformatted]';
+ }
+
+ if (isset($this->_unicode_blocks[$block_name])) {
+ $this->_unicode_blocks[$block_name] += $count;
+ } else {
+ $this->_unicode_blocks[$block_name] = $count;
+ }
+ }
+ }
+
+
+ // trigram cleanup
+ if ($this->_compile_trigram) {
+ // pad the end
+ if ($b != ' ') {
+ if (!isset($this->_trigram["$a$b "])) {
+ $this->_trigram["$a$b "] = 1;
+ } else {
+ $this->_trigram["$a$b "]++;
+ }
+ }
+
+ // perl compatibility; Language::Guess does not pad the beginning
+ // kludge
+ if (isset($dropone)) {
+ if ($this->_trigram[$dropone] == 1) {
+ unset($this->_trigram[$dropone]);
+ } else {
+ $this->_trigram[$dropone]--;
+ }
+ }
+
+ if (!empty($this->_trigram)) {
+ $this->_trigram_ranks = $this->_arr_rank($this->_trigram);
+ } else {
+ $this->_trigram_ranks = array();
+ }
+ }
+ }
+}
+
+/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
+
+?>