diff options
Diffstat (limited to 'library/langdet/Text/LanguageDetect')
-rw-r--r-- | library/langdet/Text/LanguageDetect/Exception.php | 57 | ||||
-rw-r--r-- | library/langdet/Text/LanguageDetect/ISO639.php | 341 | ||||
-rw-r--r-- | library/langdet/Text/LanguageDetect/Parser.php | 349 |
3 files changed, 0 insertions, 747 deletions
/**
 * Exception class of the Text_LanguageDetect package.
 *
 * It only declares the numeric error-code constants below; no methods or
 * properties are added on top of PHP's built-in Exception.
 */
class Text_LanguageDetect_Exception extends Exception
{
    /* ---- 10-14: language database / environment ---- */

    /** The database file could not be found. */
    const DB_NOT_FOUND = 10;

    /** The database file was found but is not readable. */
    const DB_NOT_READABLE = 11;

    /** The database file is empty. */
    const DB_EMPTY = 12;

    /** The database contents are not a PHP array. */
    const DB_NOT_ARRAY = 13;

    /** Magic quotes are activated. */
    const MAGIC_QUOTES = 14;

    /* ---- 20-21: invalid arguments ---- */

    /** A parameter of an invalid type was passed to a method. */
    const PARAM_TYPE = 20;

    /** A character in a parameter is invalid. */
    const INVALID_CHAR = 21;

    /* ---- 30: language lookup ---- */

    /** The language is not in the database. */
    const UNKNOWN_LANGUAGE = 30;

    /* ---- 40: unicode block detection ---- */

    /** An error occurred during block detection. */
    const BLOCK_DETECTION = 40;

    /* ---- 50: clustering ---- */

    /** An error occurred while clustering languages. */
    const NO_HIGHEST_KEY = 50;
}
/**
 * Provides a mapping between the languages from lang.dat and the
 * ISO 639-1 and ISO 639-2 codes.
 *
 * Note that this class contains only languages that exist in lang.dat.
 *
 * All four lookup tables are keyed by lower-case strings; the lookup
 * methods lower-case their argument before searching, so they are
 * case-insensitive.
 *
 * @category  Text
 * @package   Text_LanguageDetect
 * @author    Christian Weiske <cweiske@php.net>
 * @copyright 2011 Christian Weiske <cweiske@php.net>
 * @license   http://www.debian.org/misc/bsd.license BSD
 * @link      http://www.loc.gov/standards/iso639-2/php/code_list.php
 */
class Text_LanguageDetect_ISO639
{
    /**
     * Maps all language names from the language database to the
     * ISO 639-1 2-letter language code.
     *
     * NULL indicates that there is no 2-letter code.
     *
     * @var array
     */
    public static $nameToCode2 = array(
        'albanian'   => 'sq',
        'arabic'     => 'ar',
        'azeri'      => 'az',
        'bengali'    => 'bn',
        'bulgarian'  => 'bg',
        'cebuano'    => null,
        'croatian'   => 'hr',
        'czech'      => 'cs',
        'danish'     => 'da',
        'dutch'      => 'nl',
        'english'    => 'en',
        'estonian'   => 'et',
        'farsi'      => 'fa',
        'finnish'    => 'fi',
        'french'     => 'fr',
        'german'     => 'de',
        'hausa'      => 'ha',
        'hawaiian'   => null,
        'hindi'      => 'hi',
        'hungarian'  => 'hu',
        'icelandic'  => 'is',
        'indonesian' => 'id',
        'italian'    => 'it',
        'kazakh'     => 'kk',
        'kyrgyz'     => 'ky',
        'latin'      => 'la',
        'latvian'    => 'lv',
        'lithuanian' => 'lt',
        'macedonian' => 'mk',
        'mongolian'  => 'mn',
        'nepali'     => 'ne',
        'norwegian'  => 'no',
        'pashto'     => 'ps',
        'pidgin'     => null,
        'polish'     => 'pl',
        'portuguese' => 'pt',
        'romanian'   => 'ro',
        'russian'    => 'ru',
        'serbian'    => 'sr',
        'slovak'     => 'sk',
        'slovene'    => 'sl',
        'somali'     => 'so',
        'spanish'    => 'es',
        'swahili'    => 'sw',
        'swedish'    => 'sv',
        'tagalog'    => 'tl',
        'turkish'    => 'tr',
        'ukrainian'  => 'uk',
        'urdu'       => 'ur',
        'uzbek'      => 'uz',
        'vietnamese' => 'vi',
        'welsh'      => 'cy',
    );

    /**
     * Maps all language names from the language database to the
     * ISO 639-2 3-letter language code.
     *
     * @var array
     */
    public static $nameToCode3 = array(
        'albanian'   => 'sqi',
        'arabic'     => 'ara',
        'azeri'      => 'aze',
        'bengali'    => 'ben',
        'bulgarian'  => 'bul',
        'cebuano'    => 'ceb',
        'croatian'   => 'hrv',
        'czech'      => 'ces',
        'danish'     => 'dan',
        'dutch'      => 'nld',
        'english'    => 'eng',
        'estonian'   => 'est',
        'farsi'      => 'fas',
        'finnish'    => 'fin',
        'french'     => 'fra',
        'german'     => 'deu',
        'hausa'      => 'hau',
        'hawaiian'   => 'haw',
        'hindi'      => 'hin',
        'hungarian'  => 'hun',
        'icelandic'  => 'isl',
        'indonesian' => 'ind',
        'italian'    => 'ita',
        'kazakh'     => 'kaz',
        'kyrgyz'     => 'kir',
        'latin'      => 'lat',
        'latvian'    => 'lav',
        'lithuanian' => 'lit',
        'macedonian' => 'mkd',
        'mongolian'  => 'mon',
        'nepali'     => 'nep',
        'norwegian'  => 'nor',
        'pashto'     => 'pus',
        'pidgin'     => 'crp',
        'polish'     => 'pol',
        'portuguese' => 'por',
        'romanian'   => 'ron',
        'russian'    => 'rus',
        'serbian'    => 'srp',
        'slovak'     => 'slk',
        'slovene'    => 'slv',
        'somali'     => 'som',
        'spanish'    => 'spa',
        'swahili'    => 'swa',
        'swedish'    => 'swe',
        'tagalog'    => 'tgl',
        'turkish'    => 'tur',
        'ukrainian'  => 'ukr',
        'urdu'       => 'urd',
        'uzbek'      => 'uzb',
        'vietnamese' => 'vie',
        'welsh'      => 'cym',
    );

    /**
     * Maps ISO 639-1 2-letter language codes to the language names
     * in the language database.
     *
     * Not all languages have a 2-letter code, so some are missing.
     *
     * @var array
     */
    public static $code2ToName = array(
        'ar' => 'arabic',
        'az' => 'azeri',
        'bg' => 'bulgarian',
        'bn' => 'bengali',
        'cs' => 'czech',
        'cy' => 'welsh',
        'da' => 'danish',
        'de' => 'german',
        'en' => 'english',
        'es' => 'spanish',
        'et' => 'estonian',
        'fa' => 'farsi',
        'fi' => 'finnish',
        'fr' => 'french',
        'ha' => 'hausa',
        'hi' => 'hindi',
        'hr' => 'croatian',
        'hu' => 'hungarian',
        'id' => 'indonesian',
        'is' => 'icelandic',
        'it' => 'italian',
        'kk' => 'kazakh',
        'ky' => 'kyrgyz',
        'la' => 'latin',
        'lt' => 'lithuanian',
        'lv' => 'latvian',
        'mk' => 'macedonian',
        'mn' => 'mongolian',
        'ne' => 'nepali',
        'nl' => 'dutch',
        'no' => 'norwegian',
        'pl' => 'polish',
        'ps' => 'pashto',
        'pt' => 'portuguese',
        'ro' => 'romanian',
        'ru' => 'russian',
        'sk' => 'slovak',
        'sl' => 'slovene',
        'so' => 'somali',
        'sq' => 'albanian',
        'sr' => 'serbian',
        'sv' => 'swedish',
        'sw' => 'swahili',
        'tl' => 'tagalog',
        'tr' => 'turkish',
        'uk' => 'ukrainian',
        'ur' => 'urdu',
        'uz' => 'uzbek',
        'vi' => 'vietnamese',
    );

    /**
     * Maps ISO 639-2 3-letter language codes to the language names
     * in the language database.
     *
     * Kept as the exact inverse of $nameToCode3, so that
     * code3ToName(nameToCode3($name)) returns $name for every entry.
     *
     * @var array
     */
    public static $code3ToName = array(
        'ara' => 'arabic',
        'aze' => 'azeri',
        'ben' => 'bengali',
        'bul' => 'bulgarian',
        'ceb' => 'cebuano',
        'ces' => 'czech',
        'crp' => 'pidgin',
        'cym' => 'welsh',
        'dan' => 'danish',
        'deu' => 'german',
        'eng' => 'english',
        'est' => 'estonian',
        'fas' => 'farsi',
        'fin' => 'finnish',
        'fra' => 'french',
        'hau' => 'hausa',
        'haw' => 'hawaiian',
        'hin' => 'hindi',
        'hrv' => 'croatian',
        'hun' => 'hungarian',
        'ind' => 'indonesian',
        'isl' => 'icelandic',
        'ita' => 'italian',
        'kaz' => 'kazakh',
        'kir' => 'kyrgyz',
        'lat' => 'latin',
        'lav' => 'latvian',
        'lit' => 'lithuanian',
        'mkd' => 'macedonian',
        'mon' => 'mongolian',
        'nep' => 'nepali',
        'nld' => 'dutch',
        'nor' => 'norwegian',
        'pol' => 'polish',
        'por' => 'portuguese',
        'pus' => 'pashto',
        // Was 'rom', which is the ISO 639-2 code for Romany and did not
        // match the 'ron' used for romanian in $nameToCode3.
        'ron' => 'romanian',
        'rus' => 'russian',
        'slk' => 'slovak',
        'slv' => 'slovene',
        'som' => 'somali',
        'spa' => 'spanish',
        'sqi' => 'albanian',
        'srp' => 'serbian',
        'swa' => 'swahili',
        'swe' => 'swedish',
        'tgl' => 'tagalog',
        'tur' => 'turkish',
        'ukr' => 'ukrainian',
        'urd' => 'urdu',
        'uzb' => 'uzbek',
        'vie' => 'vietnamese',
    );

    /**
     * Returns the 2-letter ISO 639-1 code for the given language name.
     *
     * @param string $lang English language name like "swedish"
     *                     (matched case-insensitively)
     *
     * @return string|null Two-letter language code (e.g. "sv"), or NULL if
     *                     the name is unknown or has no 2-letter code
     */
    public static function nameToCode2($lang)
    {
        $lang = strtolower($lang);

        // isset() is false for the explicit NULL entries as well, so
        // languages without a 2-letter code also yield NULL.
        return isset(self::$nameToCode2[$lang])
            ? self::$nameToCode2[$lang]
            : null;
    }

    /**
     * Returns the 3-letter ISO 639-2 code for the given language name.
     *
     * @param string $lang English language name like "swedish"
     *                     (matched case-insensitively)
     *
     * @return string|null Three-letter language code (e.g. "swe"), or NULL
     *                     if not found
     */
    public static function nameToCode3($lang)
    {
        $lang = strtolower($lang);

        return isset(self::$nameToCode3[$lang])
            ? self::$nameToCode3[$lang]
            : null;
    }

    /**
     * Returns the language name for the given 2-letter ISO 639-1 code.
     *
     * @param string $code Two-letter language code (e.g. "sv"),
     *                     matched case-insensitively
     *
     * @return string|null English language name like "swedish", or NULL
     *                     if the code is unknown
     */
    public static function code2ToName($code)
    {
        // The lower-cased value must be the one used for the lookup;
        // previously it was stored in an unused variable.
        $code = strtolower($code);

        return isset(self::$code2ToName[$code])
            ? self::$code2ToName[$code]
            : null;
    }

    /**
     * Returns the language name for the given 3-letter ISO 639-2 code.
     *
     * @param string $code Three-letter language code (e.g. "swe"),
     *                     matched case-insensitively
     *
     * @return string|null English language name like "swedish", or NULL
     *                     if the code is unknown
     */
    public static function code3ToName($code)
    {
        $code = strtolower($code);

        return isset(self::$code3ToName[$code])
            ? self::$code3ToName[$code]
            : null;
    }
}
/**
 * This class represents a text sample to be parsed.
 *
 * This separates the analysis of a text sample from the primary LanguageDetect
 * class. After a new profile has been built, the data can be retrieved using
 * the accessor functions.
 *
 * This class is intended to be used by the Text_LanguageDetect class, not
 * end-users.
 *
 * Typical call order: construct with the sample, switch on the wanted
 * analyses with prepareTrigram()/prepareUnicode(), adjust options with
 * setPadStart()/setUnicodeSkipSymbols(), call analyze(), then read the
 * results through the get*() accessors.
 *
 * @category  Text
 * @package   Text_LanguageDetect
 * @author    Nicholas Pisarro
 * @copyright 2006
 * @license   BSD
 * @version   release: 0.3.0
 * @link      http://pear.php.net/package/Text_LanguageDetect/
 * @link      http://langdetect.blogspot.com/
 */
class Text_LanguageDetect_Parser extends Text_LanguageDetect
{
    /**
     * the piece of text being parsed
     *
     * @access private
     * @var string
     */
    public $_string;

    /**
     * stores the trigram frequencies of the sample (trigram => count)
     *
     * Declared under the name that analyze() and getTrigramFreqs() actually
     * use; the former declaration was named $_trigrams and was never read
     * or written.
     *
     * @access private
     * @var array
     */
    public $_trigram = array();

    /**
     * stores the trigram ranks of the sample
     *
     * @access private
     * @var array
     */
    public $_trigram_ranks = array();

    /**
     * stores the unicode blocks of the sample (block name => char count)
     *
     * @access private
     * @var array
     */
    public $_unicode_blocks = array();

    /**
     * Whether the parser should compile the unicode ranges
     *
     * @access private
     * @var bool
     */
    public $_compile_unicode = false;

    /**
     * Whether the parser should compile trigrams
     *
     * @access private
     * @var bool
     */
    public $_compile_trigram = false;

    /**
     * Whether the trigram parser should pad the beginning of the string
     *
     * @access private
     * @var bool
     */
    public $_trigram_pad_start = false;

    /**
     * Whether the unicode parser should skip non-alphabetical ascii chars
     *
     * @access private
     * @var bool
     */
    public $_unicode_skip_symbols = true;

    /**
     * Constructor
     *
     * Uses __construct() instead of the PHP 4 style class-named method,
     * which PHP 8 no longer treats as a constructor. The parent constructor
     * is deliberately not called, exactly as before.
     *
     * @param string $string string to be parsed
     */
    public function __construct($string)
    {
        $this->_string = $string;
    }

    /**
     * Returns true if a string is suitable for parsing
     *
     * A string qualifies when it is non-empty, longer than 3 bytes and
     * contains at least one non-whitespace character.
     *
     * @param string $str input string to test
     *
     * @return bool true if acceptable, false if not
     */
    public static function validateString($str)
    {
        return !empty($str)
            && strlen($str) > 3
            && (bool) preg_match('/\S/', $str);
    }

    /**
     * turn on/off trigram counting
     *
     * @param bool $bool true for on, false for off
     *
     * @return void
     */
    public function prepareTrigram($bool = true)
    {
        $this->_compile_trigram = $bool;
    }

    /**
     * turn on/off unicode block counting
     *
     * @param bool $bool true for on, false for off
     *
     * @return void
     */
    public function prepareUnicode($bool = true)
    {
        $this->_compile_unicode = $bool;
    }

    /**
     * turn on/off padding the beginning of the sample string
     *
     * @param bool $bool true for on, false for off
     *
     * @return void
     */
    public function setPadStart($bool = true)
    {
        $this->_trigram_pad_start = $bool;
    }

    /**
     * Should the unicode block counter skip non-alphabetical ascii chars?
     *
     * @param bool $bool true for on, false for off
     *
     * @return void
     */
    public function setUnicodeSkipSymbols($bool = true)
    {
        $this->_unicode_skip_symbols = $bool;
    }

    /**
     * Returns the trigram ranks for the text sample
     *
     * @return array trigram ranks in the text sample (returned by reference)
     */
    public function &getTrigramRanks()
    {
        return $this->_trigram_ranks;
    }

    /**
     * Return the trigram frequency table
     *
     * only used in testing to make sure the parser is working
     *
     * @return array trigram frequencies in the text sample
     *               (returned by reference)
     */
    public function &getTrigramFreqs()
    {
        return $this->_trigram;
    }

    /**
     * returns the array of unicode blocks
     *
     * @return array unicode blocks in the text sample (returned by reference)
     */
    public function &getUnicodeBlocks()
    {
        return $this->_unicode_blocks;
    }

    /**
     * Executes the parsing operation
     *
     * Be sure to call the set*() functions to set options and the
     * prepare*() functions first to tell it what kind of data to compute
     *
     * Afterwards the get*() functions can be used to access the compiled
     * information.
     *
     * Relies on helpers that are not defined in this class and are
     * presumably inherited from Text_LanguageDetect: _next_char(),
     * _read_unicode_block_db(), _unicode_block_name(), _utf8char2unicode()
     * and _arr_rank().
     *
     * Counts are added to whatever is already stored, so calling analyze()
     * twice on the same instance accumulates frequencies.
     *
     * @return void
     */
    public function analyze()
    {
        $len = strlen($this->_string);
        $byte_counter = 0;

        // unicode startup
        if ($this->_compile_unicode) {
            $blocks = $this->_read_unicode_block_db();
            $block_count = count($blocks);

            $unicode_chars = array();
        }

        // trigram startup
        if ($this->_compile_trigram) {
            // initialize them as blank so the parser will skip the first two
            // (since it skips trigrams with more than 2 contiguous spaces)
            $a = ' ';
            $b = ' ';

            // kludge
            // if it finds a valid trigram to start and the start pad option is
            // off, then set a variable that will be used to reduce this
            // trigram after parsing has finished
            if (!$this->_trigram_pad_start) {
                // NOTE(review): _next_char() appears to advance
                // $byte_counter by reference (it is reset right below);
                // confirm against the parent class.
                $a = $this->_next_char($this->_string, $byte_counter, true);

                if ($a != ' ') {
                    $b = $this->_next_char($this->_string, $byte_counter, true);
                    $dropone = " $a$b";
                }

                // rewind: the two characters above were only peeked at
                $byte_counter = 0;
                $a = ' ';
                $b = ' ';
            }
        }

        while ($byte_counter < $len) {
            $char = $this->_next_char($this->_string, $byte_counter, true);

            // language trigram detection
            if ($this->_compile_trigram) {
                // skip trigrams made of two or more contiguous spaces
                if (!($b == ' ' && ($a == ' ' || $char == ' '))) {
                    $trigram = $a . $b . $char;

                    if (!isset($this->_trigram[$trigram])) {
                        $this->_trigram[$trigram] = 1;
                    } else {
                        $this->_trigram[$trigram]++;
                    }
                }

                // slide the three-character window forward by one
                $a = $b;
                $b = $char;
            }

            // unicode block detection
            if ($this->_compile_unicode) {
                if ($this->_unicode_skip_symbols
                    && strlen($char) == 1
                    && ($char < 'A' || $char > 'z'
                    || ($char > 'Z' && $char < 'a'))
                    && $char != "'"
                ) {
                    // single-byte non-letter: ignored, except for the
                    // apostrophe, which is included in the language models
                    continue;
                }

                // build an array of all the characters
                if (isset($unicode_chars[$char])) {
                    $unicode_chars[$char]++;
                } else {
                    $unicode_chars[$char] = 1;
                }
            }

            // todo: add byte detection here
        }

        // unicode cleanup: fold the per-character counts into per-block counts
        if ($this->_compile_unicode) {
            foreach ($unicode_chars as $utf8_char => $count) {
                $search_result = $this->_unicode_block_name(
                    $this->_utf8char2unicode($utf8_char), $blocks, $block_count
                );

                // -1 signals that no block matched; otherwise index 2 of
                // the result is used as the block name
                if ($search_result != -1) {
                    $block_name = $search_result[2];
                } else {
                    $block_name = '[Malformatted]';
                }

                if (isset($this->_unicode_blocks[$block_name])) {
                    $this->_unicode_blocks[$block_name] += $count;
                } else {
                    $this->_unicode_blocks[$block_name] = $count;
                }
            }
        }

        // trigram cleanup
        if ($this->_compile_trigram) {
            // pad the end
            if ($b != ' ') {
                if (!isset($this->_trigram["$a$b "])) {
                    $this->_trigram["$a$b "] = 1;
                } else {
                    $this->_trigram["$a$b "]++;
                }
            }

            // perl compatibility; Language::Guess does not pad the beginning
            // kludge
            // The inner isset() guards against the leading trigram never
            // having been counted, which would otherwise raise a notice and
            // leave a NULL entry in the table.
            if (isset($dropone) && isset($this->_trigram[$dropone])) {
                if ($this->_trigram[$dropone] == 1) {
                    unset($this->_trigram[$dropone]);
                } else {
                    $this->_trigram[$dropone]--;
                }
            }

            if (!empty($this->_trigram)) {
                $this->_trigram_ranks = $this->_arr_rank($this->_trigram);
            } else {
                $this->_trigram_ranks = array();
            }
        }
    }
}

/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */