diff options
Diffstat (limited to 'include/language.php')
-rw-r--r-- | include/language.php | 92 |
1 files changed, 92 insertions, 0 deletions
/**
 * Guess the language a piece of text is written in.
 *
 * Square-bracketed tags ("[...]") are stripped from the input, and the
 * remaining text is passed to Text_LanguageDetect::detectConfidence().
 * The detected language name is then translated into a short language
 * code using the lookup table below.
 *
 * Two thresholds are read via get_config('system', ...):
 *  - language_detect_min_length     (fallback: LANGUAGE_DETECT_MIN_LENGTH)
 *  - language_detect_min_confidence (fallback: LANGUAGE_DETECT_MIN_CONFIDENCE)
 * The fallback constant is used when get_config() returns false.
 *
 * @param string $s Text to analyse.
 *
 * @return string The language code from the table, or an empty string when
 *                the stripped text is shorter than the minimum length, no
 *                language was detected, the confidence is below the minimum,
 *                or the detected language has no entry (or an empty entry)
 *                in the table.
 */
function detect_language($s) {

	// Maps ucfirst()'d detector language names to short language codes.
	// Entries with an empty value have no code assigned in this table.
	$detected_languages = array(
		'Albanian'   => 'sq',
		'Arabic'     => 'ar',
		'Azeri'      => 'az',
		'Bengali'    => 'bn',
		'Bulgarian'  => 'bg',
		'Cebuano'    => '',
		'Croatian'   => 'hr',
		'Czech'      => 'cs', // was 'cz' (a country code); the ISO 639-1 language code is 'cs'
		'Danish'     => 'da',
		'Dutch'      => 'nl',
		'English'    => 'en',
		'Estonian'   => 'et',
		'Farsi'      => 'fa',
		'Finnish'    => 'fi',
		'French'     => 'fr',
		'German'     => 'de',
		'Hausa'      => 'ha',
		'Hawaiian'   => '',
		'Hindi'      => 'hi',
		'Hungarian'  => 'hu',
		'Icelandic'  => 'is',
		'Indonesian' => 'id',
		'Italian'    => 'it',
		'Kazakh'     => 'kk',
		'Kyrgyz'     => 'ky',
		'Latin'      => 'la',
		'Latvian'    => 'lv',
		'Lithuanian' => 'lt',
		'Macedonian' => 'mk',
		'Mongolian'  => 'mn',
		'Nepali'     => 'ne',
		'Norwegian'  => 'no',
		'Pashto'     => 'ps',
		'Pidgin'     => '',
		'Polish'     => 'pl',
		'Portuguese' => 'pt',
		'Romanian'   => 'ro',
		'Russian'    => 'ru',
		'Serbian'    => 'sr',
		'Slovak'     => 'sk',
		'Slovene'    => 'sl',
		'Somali'     => 'so',
		'Spanish'    => 'es',
		'Swahili'    => 'sw',
		'Swedish'    => 'sv',
		'Tagalog'    => 'tl',
		'Turkish'    => 'tr',
		'Ukrainian'  => 'uk',
		'Urdu'       => 'ur',
		'Uzbek'      => 'uz',
		'Vietnamese' => 'vi',
		'Welsh'      => 'cy'
	);

	require_once('Text/LanguageDetect.php');

	$min_length = get_config('system','language_detect_min_length');
	if($min_length === false)
		$min_length = LANGUAGE_DETECT_MIN_LENGTH;

	$min_confidence = get_config('system','language_detect_min_confidence');
	if($min_confidence === false)
		$min_confidence = LANGUAGE_DETECT_MIN_CONFIDENCE;

	// Drop anything in square brackets so markup does not count towards
	// the length check or influence the detector.
	$naked_body = preg_replace('/\[(.+?)\]/','',$s);
	if(mb_strlen($naked_body) < intval($min_length))
		return '';

	$l = new Text_LanguageDetect;
	$lng = $l->detectConfidence($naked_body);

	logger('detect language: ' . print_r($lng,true) . $naked_body, LOGGER_DATA);

	// No result at all, or a result without a usable language name.
	if((! $lng) || (! (x($lng,'language')))) {
		return '';
	}

	if($lng['confidence'] < (float) $min_confidence) {
		logger('detect language: confidence less than ' . (float) $min_confidence, LOGGER_DATA);
		return '';
	}

	// The detector may report a language this table does not know about;
	// treat that like "not detected" instead of reading an undefined index.
	$name = ucfirst($lng['language']);

	return (isset($detected_languages[$name]) ? $detected_languages[$name] : '');

}