:arrow_up: Update Text_LanguageDetect.

Update from v0.3.0 (2012) to v1.0.0 (2017), which should remove some warnings and improve PHP 7 support. This PEAR library is now managed through Composer. Also fixes a problem in FeedutilsTest.
author: Klaus Weidenbach <Klaus.Weidenbach@gmx.net> 2017-10-25 01:57:18 +0200
committer: Klaus Weidenbach <Klaus.Weidenbach@gmx.net> 2017-10-29 22:00:06 +0100
commit: 8e4c5db766ce23d05b8507991b04fece743147de (patch)
tree: 55c89f2c145f47245e7d32380c92256051d6a8f2 /library/langdet/Text/LanguageDetect/Parser.php
parent: fe5f1e4d67d999ed3c6ef78dc4d49f5dd1a93056 (diff)
download: volse-hubzilla-8e4c5db766ce23d05b8507991b04fece743147de.tar.gz
volse-hubzilla-8e4c5db766ce23d05b8507991b04fece743147de.tar.bz2
volse-hubzilla-8e4c5db766ce23d05b8507991b04fece743147de.zip
1 files changed, 0 insertions, 349 deletions
diff --git a/library/langdet/Text/LanguageDetect/Parser.php b/library/langdet/Text/LanguageDetect/Parser.php
deleted file mode 100644
index 1c20c2657..000000000
--- a/library/langdet/Text/LanguageDetect/Parser.php
+++ /dev/null
@@ -1,349 +0,0 @@
-<?php
-
-/**
- * This class represents a text sample to be parsed.
- *
- * @category    Text
- * @package     Text_LanguageDetect
- * @author      Nicholas Pisarro
- * @copyright   2006
- * @license     BSD
- * @version     CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $
- * @link        http://pear.php.net/package/Text_LanguageDetect/
- * @link        http://langdetect.blogspot.com/
- */
-
-/**
- * This class represents a text sample to be parsed.
- *
- * This separates the analysis of a text sample from the primary LanguageDetect
- * class. After a new profile has been built, the data can be retrieved using
- * the accessor functions.
- *
- * This class is intended to be used by the Text_LanguageDetect class, not 
- * end-users.
- *
- * @category    Text
- * @package     Text_LanguageDetect
- * @author      Nicholas Pisarro
- * @copyright   2006
- * @license     BSD
- * @version     release: 0.3.0
- */
-class Text_LanguageDetect_Parser extends Text_LanguageDetect
-{
-    /**
-     * the piece of text being parsed
-     *
-     * @access  private
-     * @var     string
-     */
-    var $_string;
-
-    /**
-     * stores the trigram frequencies of the sample
-     *
-     * @access  private
-     * @var     string
-     */
-    var $_trigrams = array();
-
-    /**
-     * stores the trigram ranks of the sample
-     *
-     * @access  private
-     * @var     array
-     */
-    var $_trigram_ranks = array();
-
-    /**
-     * stores the unicode blocks of the sample
-     *
-     * @access  private
-     * @var     array
-     */
-    var $_unicode_blocks = array();
-    
-    /**
-     * Whether the parser should compile the unicode ranges
-     * 
-     * @access  private
-     * @var     bool
-     */
-    var $_compile_unicode = false;
-
-    /**
-     * Whether the parser should compile trigrams
-     *
-     * @access  private
-     * @var     bool
-     */
-    var $_compile_trigram = false;
-
-    /**
-     * Whether the trigram parser should pad the beginning of the string
-     *
-     * @access  private
-     * @var     bool
-     */
-    var $_trigram_pad_start = false;
-
-    /**
-     * Whether the unicode parser should skip non-alphabetical ascii chars
-     *
-     * @access  private
-     * @var     bool
-     */
-    var $_unicode_skip_symbols = true;
-
-    /**
-     * Constructor
-     *
-     * @access  private
-     * @param   string  $string     string to be parsed
-     */
-    function Text_LanguageDetect_Parser($string) {
-        $this->_string = $string;
-    }
-
-    /**
-     * Returns true if a string is suitable for parsing
-     *
-     * @param   string  $str    input string to test
-     * @return  bool            true if acceptable, false if not
-     */
-    public static function validateString($str) {
-        if (!empty($str) && strlen($str) > 3 && preg_match('/\S/', $str)) {
-            return true;
-        } else {
-            return false;
-        }
-    }
-
-    /**
-     * turn on/off trigram counting
-     *
-     * @access  public
-     * @param   bool    $bool true for on, false for off
-     */
-    function prepareTrigram($bool = true)
-    {
-        $this->_compile_trigram = $bool;
-    }
-
-    /**
-     * turn on/off unicode block counting
-     *
-     * @access  public
-     * @param   bool    $bool true for on, false for off
-     */
-    function prepareUnicode($bool = true)
-    {
-        $this->_compile_unicode = $bool;
-    }
-
-    /**
-     * turn on/off padding the beginning of the sample string
-     *
-     * @access  public
-     * @param   bool    $bool true for on, false for off
-     */
-    function setPadStart($bool = true)
-    {
-        $this->_trigram_pad_start = $bool;
-    }
-
-    /**
-     * Should the unicode block counter skip non-alphabetical ascii chars?
-     *
-     * @access  public
-     * @param   bool    $bool true for on, false for off
-     */
-    function setUnicodeSkipSymbols($bool = true)
-    {
-        $this->_unicode_skip_symbols = $bool;
-    }
-
-    /**
-     * Returns the trigram ranks for the text sample
-     *
-     * @access  public
-     * @return  array    trigram ranks in the text sample
-     */
-    function &getTrigramRanks()
-    {
-        return $this->_trigram_ranks;
-    }
-
-    /**
-     * Return the trigram freqency table
-     *
-     * only used in testing to make sure the parser is working
-     *
-     * @access  public
-     * @return  array    trigram freqencies in the text sample
-     */
-    function &getTrigramFreqs()
-    {
-        return $this->_trigram;
-    }
-
-    /**
-     * returns the array of unicode blocks
-     *
-     * @access  public
-     * @return  array   unicode blocks in the text sample
-     */
-    function &getUnicodeBlocks()
-    {
-        return $this->_unicode_blocks;
-    }
-
-    /**
-     * Executes the parsing operation
-     * 
-     * Be sure to call the set*() functions to set options and the 
-     * prepare*() functions first to tell it what kind of data to compute
-     *
-     * Afterwards the get*() functions can be used to access the compiled
-     * information.
-     *
-     * @access public
-     */
-    function analyze()
-    {
-        $len = strlen($this->_string);
-        $byte_counter = 0;
-
-
-        // unicode startup
-        if ($this->_compile_unicode) {
-            $blocks = $this->_read_unicode_block_db();
-            $block_count = count($blocks);
-
-            $skipped_count = 0;
-            $unicode_chars = array();
-        }
-
-        // trigram startup
-        if ($this->_compile_trigram) {
-            // initialize them as blank so the parser will skip the first two
-            // (since it skips trigrams with more than  2 contiguous spaces)
-            $a = ' ';
-            $b = ' ';
-
-            // kludge
-            // if it finds a valid trigram to start and the start pad option is
-            // off, then set a variable that will be used to reduce this
-            // trigram after parsing has finished
-            if (!$this->_trigram_pad_start) {
-                $a = $this->_next_char($this->_string, $byte_counter, true);
-
-                if ($a != ' ') {
-                    $b = $this->_next_char($this->_string, $byte_counter, true);
-                    $dropone = " $a$b";
-                }
-
-                $byte_counter = 0;
-                $a = ' ';
-                $b = ' ';
-            }
-        }
-
-        while ($byte_counter < $len) {
-            $char = $this->_next_char($this->_string, $byte_counter, true);
-
-
-            // language trigram detection
-            if ($this->_compile_trigram) {
-                if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
-                    if (!isset($this->_trigram[$a . $b . $char])) {
-                       $this->_trigram[$a . $b . $char] = 1;
-                    } else {
-                       $this->_trigram[$a . $b . $char]++;
-                    }
-                }
-
-                $a = $b;
-                $b = $char;
-            }
-
-            // unicode block detection
-            if ($this->_compile_unicode) {
-                if ($this->_unicode_skip_symbols
-                        && strlen($char) == 1
-                        && ($char < 'A' || $char > 'z'
-                        || ($char > 'Z' && $char < 'a'))
-                        && $char != "'") {  // does not skip the apostrophe
-                                            // since it's included in the language
-                                            // models
-
-                    $skipped_count++;
-                    continue;
-                }
-
-                // build an array of all the characters
-                if (isset($unicode_chars[$char])) {
-                    $unicode_chars[$char]++;
-                } else {
-                    $unicode_chars[$char] = 1;
-                }
-            }
-
-            // todo: add byte detection here
-        }
-
-        // unicode cleanup
-        if ($this->_compile_unicode) {
-            foreach ($unicode_chars as $utf8_char => $count) {
-                $search_result = $this->_unicode_block_name(
-                        $this->_utf8char2unicode($utf8_char), $blocks, $block_count);
-
-                if ($search_result != -1) {
-                    $block_name = $search_result[2];
-                } else {
-                    $block_name = '[Malformatted]';
-                }
-
-                if (isset($this->_unicode_blocks[$block_name])) {
-                    $this->_unicode_blocks[$block_name] += $count;
-                } else {
-                    $this->_unicode_blocks[$block_name] = $count;
-                }
-            }
-        }
-
-
-        // trigram cleanup
-        if ($this->_compile_trigram) {
-            // pad the end
-            if ($b != ' ') {
-                if (!isset($this->_trigram["$a$b "])) {
-                    $this->_trigram["$a$b "] = 1;
-                } else {
-                    $this->_trigram["$a$b "]++;
-                }
-            }
-
-            // perl compatibility; Language::Guess does not pad the beginning
-            // kludge
-            if (isset($dropone)) {
-                if ($this->_trigram[$dropone] == 1) {
-                    unset($this->_trigram[$dropone]);
-                } else {
-                    $this->_trigram[$dropone]--;
-                }
-            }
-
-            if (!empty($this->_trigram)) {
-                $this->_trigram_ranks = $this->_arr_rank($this->_trigram);
-            } else {
-                $this->_trigram_ranks = array();
-            }
-        }
-    }
-}
-
-/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
-
-?>
author	Klaus Weidenbach <Klaus.Weidenbach@gmx.net>	2017-10-25 01:57:18 +0200
committer	Klaus Weidenbach <Klaus.Weidenbach@gmx.net>	2017-10-29 22:00:06 +0100
commit	8e4c5db766ce23d05b8507991b04fece743147de (patch)
tree	55c89f2c145f47245e7d32380c92256051d6a8f2 /library/langdet/Text/LanguageDetect/Parser.php
parent	fe5f1e4d67d999ed3c6ef78dc4d49f5dd1a93056 (diff)
download	volse-hubzilla-8e4c5db766ce23d05b8507991b04fece743147de.tar.gz volse-hubzilla-8e4c5db766ce23d05b8507991b04fece743147de.tar.bz2 volse-hubzilla-8e4c5db766ce23d05b8507991b04fece743147de.zip