language detection library

author: friendica <info@friendica.com> 2012-05-12 01:55:18 -0700
committer: friendica <info@friendica.com> 2012-05-12 01:55:18 -0700
commit: 62727012d37ef3d3cacc413d5667dc2d7bbf9cbb (patch)
tree: 14c222386842938da02f3d43ecb8da12c63e0dc5 /library/langdet/Text/LanguageDetect
parent: 99e4ea19e733f86259e39f5a22d64f1521abc5ae (diff)
download: volse-hubzilla-62727012d37ef3d3cacc413d5667dc2d7bbf9cbb.tar.gz
volse-hubzilla-62727012d37ef3d3cacc413d5667dc2d7bbf9cbb.tar.bz2
volse-hubzilla-62727012d37ef3d3cacc413d5667dc2d7bbf9cbb.zip
3 files changed, 747 insertions, 0 deletions
diff --git a/library/langdet/Text/LanguageDetect/Exception.php b/library/langdet/Text/LanguageDetect/Exception.php
new file mode 100644
index 000000000..196d994f5
--- /dev/null
+++ b/library/langdet/Text/LanguageDetect/Exception.php
@@ -0,0 +1,57 @@
+<?php
+class Text_LanguageDetect_Exception extends Exception
+{
+    /**
+     * Database file could not be found (10-14: database/environment errors)
+     */
+    const DB_NOT_FOUND = 10;
+
+    /**
+     * Database file found, but not readable
+     */
+    const DB_NOT_READABLE = 11;
+
+    /**
+     * Database file is empty
+     */
+    const DB_EMPTY = 12;
+
+    /**
+     * Database contents are not a PHP array
+     */
+    const DB_NOT_ARRAY = 13;
+
+    /**
+     * Magic quotes are activated
+     */
+    const MAGIC_QUOTES = 14;
+
+    // 20-21: invalid arguments
+    /**
+     * Parameter of invalid type passed to method
+     */
+    const PARAM_TYPE = 20;
+
+    /**
+     * Character in parameter is invalid
+     */
+    const INVALID_CHAR = 21;
+
+    // 30: language lookup
+    /**
+     * Language is not in the database
+     */
+    const UNKNOWN_LANGUAGE = 30;
+
+    // 40: unicode block detection
+    /**
+     * Error during block detection
+     */
+    const BLOCK_DETECTION = 40;
+
+    // 50: language clustering
+    /**
+     * Error while clustering languages
+     */
+    const NO_HIGHEST_KEY = 50;
+}
diff --git a/library/langdet/Text/LanguageDetect/ISO639.php b/library/langdet/Text/LanguageDetect/ISO639.php
new file mode 100644
index 000000000..c577a2e1a
--- /dev/null
+++ b/library/langdet/Text/LanguageDetect/ISO639.php
@@ -0,0 +1,341 @@
+<?php
+/**
+ * Part of Text_LanguageDetect
+ *
+ * PHP version 5
+ *
+ * @category  Text
+ * @package   Text_LanguageDetect
+ * @author    Christian Weiske <cweiske@php.net>
+ * @copyright 2011 Christian Weiske <cweiske@php.net>
+ * @license   http://www.debian.org/misc/bsd.license BSD
+ * @version   SVN: $Id$
+ * @link      http://pear.php.net/package/Text_LanguageDetect/
+ */
+
+/**
+ * Provides a mapping between the languages from lang.dat and the
+ * ISO 639-1 and ISO 639-2 codes.
+ *
+ * Note that this class contains only languages that exist in lang.dat.
+ *
+ * @category  Text
+ * @package   Text_LanguageDetect
+ * @author    Christian Weiske <cweiske@php.net>
+ * @copyright 2011 Christian Weiske <cweiske@php.net>
+ * @license   http://www.debian.org/misc/bsd.license BSD
+ * @link      http://www.loc.gov/standards/iso639-2/php/code_list.php
+ */
+class Text_LanguageDetect_ISO639
+{
+    /**
+     * Maps all language names from the language database to the
+     * ISO 639-1 2-letter language code.
+     *
+     * NULL indicates that there is no 2-letter code.
+     *
+     * @var array
+     */
+    public static $nameToCode2 = array(
+        'albanian'   => 'sq',
+        'arabic'     => 'ar',
+        'azeri'      => 'az',
+        'bengali'    => 'bn',
+        'bulgarian'  => 'bg',
+        'cebuano'    => null,
+        'croatian'   => 'hr',
+        'czech'      => 'cs',
+        'danish'     => 'da',
+        'dutch'      => 'nl',
+        'english'    => 'en',
+        'estonian'   => 'et',
+        'farsi'      => 'fa',
+        'finnish'    => 'fi',
+        'french'     => 'fr',
+        'german'     => 'de',
+        'hausa'      => 'ha',
+        'hawaiian'   => null,
+        'hindi'      => 'hi',
+        'hungarian'  => 'hu',
+        'icelandic'  => 'is',
+        'indonesian' => 'id',
+        'italian'    => 'it',
+        'kazakh'     => 'kk',
+        'kyrgyz'     => 'ky',
+        'latin'      => 'la',
+        'latvian'    => 'lv',
+        'lithuanian' => 'lt',
+        'macedonian' => 'mk',
+        'mongolian'  => 'mn',
+        'nepali'     => 'ne',
+        'norwegian'  => 'no',
+        'pashto'     => 'ps',
+        'pidgin'     => null,
+        'polish'     => 'pl',
+        'portuguese' => 'pt',
+        'romanian'   => 'ro',
+        'russian'    => 'ru',
+        'serbian'    => 'sr',
+        'slovak'     => 'sk',
+        'slovene'    => 'sl',
+        'somali'     => 'so',
+        'spanish'    => 'es',
+        'swahili'    => 'sw',
+        'swedish'    => 'sv',
+        'tagalog'    => 'tl',
+        'turkish'    => 'tr',
+        'ukrainian'  => 'uk',
+        'urdu'       => 'ur',
+        'uzbek'      => 'uz',
+        'vietnamese' => 'vi',
+        'welsh'      => 'cy',
+    );
+
+    /**
+     * Maps all language names from the language database to the
+     * ISO 639-2 3-letter language code.
+     *
+     * @var array
+     */
+    public static $nameToCode3 = array(
+        'albanian'   => 'sqi',
+        'arabic'     => 'ara',
+        'azeri'      => 'aze',
+        'bengali'    => 'ben',
+        'bulgarian'  => 'bul',
+        'cebuano'    => 'ceb',
+        'croatian'   => 'hrv',
+        'czech'      => 'ces',
+        'danish'     => 'dan',
+        'dutch'      => 'nld',
+        'english'    => 'eng',
+        'estonian'   => 'est',
+        'farsi'      => 'fas',
+        'finnish'    => 'fin',
+        'french'     => 'fra',
+        'german'     => 'deu',
+        'hausa'      => 'hau',
+        'hawaiian'   => 'haw',
+        'hindi'      => 'hin',
+        'hungarian'  => 'hun',
+        'icelandic'  => 'isl',
+        'indonesian' => 'ind',
+        'italian'    => 'ita',
+        'kazakh'     => 'kaz',
+        'kyrgyz'     => 'kir',
+        'latin'      => 'lat',
+        'latvian'    => 'lav',
+        'lithuanian' => 'lit',
+        'macedonian' => 'mkd',
+        'mongolian'  => 'mon',
+        'nepali'     => 'nep',
+        'norwegian'  => 'nor',
+        'pashto'     => 'pus',
+        'pidgin'     => 'crp',
+        'polish'     => 'pol',
+        'portuguese' => 'por',
+        'romanian'   => 'ron',
+        'russian'    => 'rus',
+        'serbian'    => 'srp',
+        'slovak'     => 'slk',
+        'slovene'    => 'slv',
+        'somali'     => 'som',
+        'spanish'    => 'spa',
+        'swahili'    => 'swa',
+        'swedish'    => 'swe',
+        'tagalog'    => 'tgl',
+        'turkish'    => 'tur',
+        'ukrainian'  => 'ukr',
+        'urdu'       => 'urd',
+        'uzbek'      => 'uzb',
+        'vietnamese' => 'vie',
+        'welsh'      => 'cym',
+    );
+
+    /**
+     * Maps ISO 639-1 2-letter language codes to the language names
+     * in the language database
+     *
+     * Not all languages have a 2 letter code, so some are missing
+     *
+     * @var array
+     */
+    public static $code2ToName = array(
+        'ar' => 'arabic',
+        'az' => 'azeri',
+        'bg' => 'bulgarian',
+        'bn' => 'bengali',
+        'cs' => 'czech',
+        'cy' => 'welsh',
+        'da' => 'danish',
+        'de' => 'german',
+        'en' => 'english',
+        'es' => 'spanish',
+        'et' => 'estonian',
+        'fa' => 'farsi',
+        'fi' => 'finnish',
+        'fr' => 'french',
+        'ha' => 'hausa',
+        'hi' => 'hindi',
+        'hr' => 'croatian',
+        'hu' => 'hungarian',
+        'id' => 'indonesian',
+        'is' => 'icelandic',
+        'it' => 'italian',
+        'kk' => 'kazakh',
+        'ky' => 'kyrgyz',
+        'la' => 'latin',
+        'lt' => 'lithuanian',
+        'lv' => 'latvian',
+        'mk' => 'macedonian',
+        'mn' => 'mongolian',
+        'ne' => 'nepali',
+        'nl' => 'dutch',
+        'no' => 'norwegian',
+        'pl' => 'polish',
+        'ps' => 'pashto',
+        'pt' => 'portuguese',
+        'ro' => 'romanian',
+        'ru' => 'russian',
+        'sk' => 'slovak',
+        'sl' => 'slovene',
+        'so' => 'somali',
+        'sq' => 'albanian',
+        'sr' => 'serbian',
+        'sv' => 'swedish',
+        'sw' => 'swahili',
+        'tl' => 'tagalog',
+        'tr' => 'turkish',
+        'uk' => 'ukrainian',
+        'ur' => 'urdu',
+        'uz' => 'uzbek',
+        'vi' => 'vietnamese',
+    );
+
+    /**
+     * Maps ISO 639-2 3-letter language codes to the language names
+     * in the language database (the inverse of $nameToCode3).
+     *
+     * @var array
+     */
+    public static $code3ToName = array(
+        'ara' => 'arabic',
+        'aze' => 'azeri',
+        'ben' => 'bengali',
+        'bul' => 'bulgarian',
+        'ceb' => 'cebuano',
+        'ces' => 'czech',
+        'crp' => 'pidgin',
+        'cym' => 'welsh',
+        'dan' => 'danish',
+        'deu' => 'german',
+        'eng' => 'english',
+        'est' => 'estonian',
+        'fas' => 'farsi',
+        'fin' => 'finnish',
+        'fra' => 'french',
+        'hau' => 'hausa',
+        'haw' => 'hawaiian',
+        'hin' => 'hindi',
+        'hrv' => 'croatian',
+        'hun' => 'hungarian',
+        'ind' => 'indonesian',
+        'isl' => 'icelandic',
+        'ita' => 'italian',
+        'kaz' => 'kazakh',
+        'kir' => 'kyrgyz',
+        'lat' => 'latin',
+        'lav' => 'latvian',
+        'lit' => 'lithuanian',
+        'mkd' => 'macedonian',
+        'mon' => 'mongolian',
+        'nep' => 'nepali',
+        'nld' => 'dutch',
+        'nor' => 'norwegian',
+        'pol' => 'polish',
+        'por' => 'portuguese',
+        'pus' => 'pashto',
+        'ron' => 'romanian',
+        'rus' => 'russian',
+        'slk' => 'slovak',
+        'slv' => 'slovene',
+        'som' => 'somali',
+        'spa' => 'spanish',
+        'sqi' => 'albanian',
+        'srp' => 'serbian',
+        'swa' => 'swahili',
+        'swe' => 'swedish',
+        'tgl' => 'tagalog',
+        'tur' => 'turkish',
+        'ukr' => 'ukrainian',
+        'urd' => 'urdu',
+        'uzb' => 'uzbek',
+        'vie' => 'vietnamese',
+    );
+
+    /**
+     * Returns the 2-letter ISO 639-1 code for the given language name.
+     *
+     * @param string $lang English language name like "swedish" (any case)
+     *
+     * @return string|null Two-letter code (e.g. "sv"); NULL if unknown or none
+     */
+    public static function nameToCode2($lang)
+    {
+        $lang = strtolower($lang);
+        if (!isset(self::$nameToCode2[$lang])) {
+            return null;
+        }
+        return self::$nameToCode2[$lang];
+    }
+
+    /**
+     * Returns the 3-letter ISO 639-2 code for the given language name.
+     *
+     * @param string $lang English language name like "swedish" (any case)
+     *
+     * @return string|null Three-letter code (e.g. "swe") or NULL if not found
+     */
+    public static function nameToCode3($lang)
+    {
+        $lang = strtolower($lang);
+        if (!isset(self::$nameToCode3[$lang])) {
+            return null;
+        }
+        return self::$nameToCode3[$lang];
+    }
+
+    /**
+     * Returns the language name for the given 2-letter ISO 639-1 code.
+     *
+     * @param string $code Two-letter language code (e.g. "sv"), any case
+     *
+     * @return string|null English language name like "swedish", NULL if unknown
+     */
+    public static function code2ToName($code)
+    {
+        $code = strtolower($code);
+        if (!isset(self::$code2ToName[$code])) {
+            return null;
+        }
+        return self::$code2ToName[$code];
+    }
+
+    /**
+     * Returns the language name for the given 3-letter ISO 639-2 code.
+     *
+     * @param string $code Three-letter language code (e.g. "swe"), any case
+     *
+     * @return string|null English language name like "swedish", NULL if unknown
+     */
+    public static function code3ToName($code)
+    {
+        $code = strtolower($code);
+        if (!isset(self::$code3ToName[$code])) {
+            return null;
+        }
+        return self::$code3ToName[$code];
+    }
+}
+
+?>
+\ No newline at end of file
diff --git a/library/langdet/Text/LanguageDetect/Parser.php b/library/langdet/Text/LanguageDetect/Parser.php
new file mode 100644
index 000000000..1c20c2657
--- /dev/null
+++ b/library/langdet/Text/LanguageDetect/Parser.php
@@ -0,0 +1,349 @@
+<?php
+
+/**
+ * This class represents a text sample to be parsed.
+ *
+ * @category    Text
+ * @package     Text_LanguageDetect
+ * @author      Nicholas Pisarro
+ * @copyright   2006
+ * @license     BSD
+ * @version     CVS: $Id: Parser.php 322327 2012-01-15 17:55:59Z cweiske $
+ * @link        http://pear.php.net/package/Text_LanguageDetect/
+ * @link        http://langdetect.blogspot.com/
+ */
+
+/**
+ * This class represents a text sample to be parsed.
+ *
+ * This separates the analysis of a text sample from the primary LanguageDetect
+ * class. After a new profile has been built, the data can be retrieved using
+ * the accessor functions.
+ *
+ * This class is intended to be used by the Text_LanguageDetect class, not
+ * end-users.
+ *
+ * @category    Text
+ * @package     Text_LanguageDetect
+ * @author      Nicholas Pisarro
+ * @copyright   2006
+ * @license     BSD
+ * @version     release: 0.3.0
+ */
+class Text_LanguageDetect_Parser extends Text_LanguageDetect
+{
+    /**
+     * the piece of text being parsed
+     *
+     * @access  private
+     * @var     string
+     */
+    public $_string;
+
+    /**
+     * stores the trigram frequencies of the sample (trigram => count)
+     *
+     * @access  private
+     * @var     array
+     */
+    public $_trigram = array();
+
+    /**
+     * stores the trigram ranks of the sample
+     *
+     * @access  private
+     * @var     array
+     */
+    public $_trigram_ranks = array();
+
+    /**
+     * stores the unicode blocks of the sample (block name => count)
+     *
+     * @access  private
+     * @var     array
+     */
+    public $_unicode_blocks = array();
+
+    /**
+     * Whether the parser should compile the unicode ranges
+     *
+     * @access  private
+     * @var     bool
+     */
+    public $_compile_unicode = false;
+
+    /**
+     * Whether the parser should compile trigrams
+     *
+     * @access  private
+     * @var     bool
+     */
+    public $_compile_trigram = false;
+
+    /**
+     * Whether the trigram parser should pad the beginning of the string
+     *
+     * @access  private
+     * @var     bool
+     */
+    public $_trigram_pad_start = false;
+
+    /**
+     * Whether the unicode parser should skip non-alphabetical ascii chars
+     *
+     * @access  private
+     * @var     bool
+     */
+    public $_unicode_skip_symbols = true;
+
+    /**
+     * Constructor; only stores the sample, analyze() does the actual work
+     *
+     * @access  public
+     * @param   string  $string     string to be parsed
+     */
+    public function __construct($string) {
+        $this->_string = $string;
+    }
+
+    /**
+     * Returns true if a string is suitable for parsing
+     *
+     * @param   string  $str    input string to test
+     * @return  bool            true if acceptable, false if not
+     */
+    public static function validateString($str) {
+        // acceptable means: not empty, longer than three bytes and
+        // containing at least one non-whitespace character
+        return !empty($str)
+            && strlen($str) > 3
+            && preg_match('/\S/', $str);
+    }
+
+    /**
+     * turn on/off trigram counting
+     *
+     * @access  public
+     * @param   bool    $bool true for on, false for off
+     */
+    public function prepareTrigram($bool = true)
+    {
+        $this->_compile_trigram = $bool;
+    }
+
+    /**
+     * turn on/off unicode block counting
+     *
+     * @access  public
+     * @param   bool    $bool true for on, false for off
+     */
+    public function prepareUnicode($bool = true)
+    {
+        $this->_compile_unicode = $bool;
+    }
+
+    /**
+     * turn on/off padding the beginning of the sample string
+     *
+     * @access  public
+     * @param   bool    $bool true for on, false for off
+     */
+    public function setPadStart($bool = true)
+    {
+        $this->_trigram_pad_start = $bool;
+    }
+
+    /**
+     * Should the unicode block counter skip non-alphabetical ascii chars?
+     *
+     * @access  public
+     * @param   bool    $bool true for on, false for off
+     */
+    public function setUnicodeSkipSymbols($bool = true)
+    {
+        $this->_unicode_skip_symbols = $bool;
+    }
+
+    /**
+     * Returns the trigram ranks for the text sample
+     *
+     * @access  public
+     * @return  array    trigram ranks in the text sample (by reference)
+     */
+    public function &getTrigramRanks()
+    {
+        return $this->_trigram_ranks;
+    }
+
+    /**
+     * Return the trigram frequency table
+     *
+     * only used in testing to make sure the parser is working
+     *
+     * @access  public
+     * @return  array    trigram frequencies in the text sample (by reference)
+     */
+    public function &getTrigramFreqs()
+    {
+        return $this->_trigram;
+    }
+
+    /**
+     * returns the array of unicode blocks
+     *
+     * @access  public
+     * @return  array   unicode blocks in the text sample (by reference)
+     */
+    public function &getUnicodeBlocks()
+    {
+        return $this->_unicode_blocks;
+    }
+
+    /**
+     * Executes the parsing operation
+     *
+     * Be sure to call the set*() functions to set options and the
+     * prepare*() functions first to tell it what kind of data to compute
+     *
+     * Afterwards the get*() functions can be used to access the compiled
+     * information.
+     *
+     * @access public
+     */
+    public function analyze()
+    {
+        $len = strlen($this->_string);
+        $byte_counter = 0;
+
+
+        // unicode startup
+        if ($this->_compile_unicode) {
+            $blocks = $this->_read_unicode_block_db();
+            $block_count = count($blocks);
+
+            $skipped_count = 0;
+            $unicode_chars = array();
+        }
+
+        // trigram startup
+        if ($this->_compile_trigram) {
+            // initialize them as blank so the parser will skip the first two
+            // (since it skips trigrams with more than  2 contiguous spaces)
+            $a = ' ';
+            $b = ' ';
+
+            // kludge
+            // if it finds a valid trigram to start and the start pad option is
+            // off, then set a variable that will be used to reduce this
+            // trigram after parsing has finished
+            if (!$this->_trigram_pad_start) {
+                $a = $this->_next_char($this->_string, $byte_counter, true);
+
+                if ($a != ' ') {
+                    $b = $this->_next_char($this->_string, $byte_counter, true);
+                    $dropone = " $a$b";
+                }
+
+                $byte_counter = 0;
+                $a = ' ';
+                $b = ' ';
+            }
+        }
+
+        while ($byte_counter < $len) {
+            $char = $this->_next_char($this->_string, $byte_counter, true);
+
+            // NOTE(review): the true flag presumably lowercases $char - confirm
+            // language trigram detection
+            if ($this->_compile_trigram) {
+                if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
+                    if (!isset($this->_trigram[$a . $b . $char])) {
+                       $this->_trigram[$a . $b . $char] = 1;
+                    } else {
+                       $this->_trigram[$a . $b . $char]++;
+                    }
+                }
+
+                $a = $b;
+                $b = $char;
+            }
+
+            // unicode block detection
+            if ($this->_compile_unicode) {
+                if ($this->_unicode_skip_symbols
+                        && strlen($char) == 1
+                        && ($char < 'A' || $char > 'z'
+                        || ($char > 'Z' && $char < 'a'))
+                        && $char != "'") {  // does not skip the apostrophe
+                                            // since it's included in the language
+                                            // models
+
+                    $skipped_count++;
+                    continue;
+                }
+
+                // build an array of all the characters
+                if (isset($unicode_chars[$char])) {
+                    $unicode_chars[$char]++;
+                } else {
+                    $unicode_chars[$char] = 1;
+                }
+            }
+
+            // todo: add byte detection here
+        }
+
+        // unicode cleanup
+        if ($this->_compile_unicode) {
+            foreach ($unicode_chars as $utf8_char => $count) {
+                $search_result = $this->_unicode_block_name(
+                        $this->_utf8char2unicode($utf8_char), $blocks, $block_count);
+
+                if ($search_result != -1) {
+                    $block_name = $search_result[2];
+                } else {
+                    $block_name = '[Malformatted]';
+                }
+
+                if (isset($this->_unicode_blocks[$block_name])) {
+                    $this->_unicode_blocks[$block_name] += $count;
+                } else {
+                    $this->_unicode_blocks[$block_name] = $count;
+                }
+            }
+        }
+
+
+        // trigram cleanup
+        if ($this->_compile_trigram) {
+            // pad the end
+            if ($b != ' ') {
+                if (!isset($this->_trigram["$a$b "])) {
+                    $this->_trigram["$a$b "] = 1;
+                } else {
+                    $this->_trigram["$a$b "]++;
+                }
+            }
+
+            // perl compatibility; Language::Guess does not pad the beginning
+            // kludge (only applied if that start trigram was actually counted)
+            if (isset($dropone, $this->_trigram[$dropone])) {
+                if ($this->_trigram[$dropone] == 1) {
+                    unset($this->_trigram[$dropone]);
+                } else {
+                    $this->_trigram[$dropone]--;
+                }
+            }
+
+            if (!empty($this->_trigram)) {
+                $this->_trigram_ranks = $this->_arr_rank($this->_trigram);
+            } else {
+                $this->_trigram_ranks = array();
+            }
+        }
+    }
+}
+
+/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
+
+?>
author	friendica <info@friendica.com>	2012-05-12 01:55:18 -0700
committer	friendica <info@friendica.com>	2012-05-12 01:55:18 -0700
commit	62727012d37ef3d3cacc413d5667dc2d7bbf9cbb (patch)
tree	14c222386842938da02f3d43ecb8da12c63e0dc5 /library/langdet/Text/LanguageDetect
parent	99e4ea19e733f86259e39f5a22d64f1521abc5ae (diff)
download	volse-hubzilla-62727012d37ef3d3cacc413d5667dc2d7bbf9cbb.tar.gz volse-hubzilla-62727012d37ef3d3cacc413d5667dc2d7bbf9cbb.tar.bz2 volse-hubzilla-62727012d37ef3d3cacc413d5667dc2d7bbf9cbb.zip