From 2f2e353ecef52639a78cac3bc407ccfe64197ac9 Mon Sep 17 00:00:00 2001 From: Mario Date: Sat, 7 Oct 2023 16:00:34 +0000 Subject: use new lang detect library which supports many more languages --- include/help.php | 9 +++++---- include/language.php | 45 +++++++++++++++++---------------------------- 2 files changed, 22 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/help.php b/include/help.php index 8a913578a..9e4be57f9 100644 --- a/include/help.php +++ b/include/help.php @@ -1,6 +1,7 @@ setNameMode(2); - if($lang_detect->languageExists(argv(1))) { + $language_repository = new LanguageRepository; + $languages = $language_repository->getList(); + + if(array_key_exists(argv(1), $languages)) { $lang = argv(1); $from_url = true; } else { diff --git a/include/language.php b/include/language.php index d84f02a36..1b2e7332e 100644 --- a/include/language.php +++ b/include/language.php @@ -9,6 +9,7 @@ */ use CommerceGuys\Intl\Language\LanguageRepository; +use LanguageDetection\Language; /** * @brief Get the browser's submitted preferred languages. @@ -299,14 +300,10 @@ function string_plural_select_default($n) { /** * @brief Takes a string and tries to identify the language. * - * It uses the pear library Text_LanguageDetect and it can identify 52 human languages. - * It returns the identified languges and a confidence score for each. - * * Strings need to have a min length config['system']['language_detect_min_length'] * and you can influence the confidence that must be met before a result will get * returned through config['system']['language_detect_min_confidence'].
* - * @see http://pear.php.net/package/Text_LanguageDetect * @param string $s A string to examine * @return string Language code in 2-letter ISO 639-1 (en, de, fr) format */ @@ -316,43 +313,35 @@ function detect_language($s) { return EMPTY_STR; } - $min_length = get_config('system', 'language_detect_min_length'); - if ($min_length === false) - $min_length = LANGUAGE_DETECT_MIN_LENGTH; - - $min_confidence = get_config('system', 'language_detect_min_confidence'); - if ($min_confidence === false) - $min_confidence = LANGUAGE_DETECT_MIN_CONFIDENCE; + $min_length = get_config('system', 'language_detect_min_length', LANGUAGE_DETECT_MIN_LENGTH); + $min_confidence = get_config('system', 'language_detect_min_confidence', LANGUAGE_DETECT_MIN_CONFIDENCE); // embedded apps have long base64 strings which will trip up the detector. $naked_body = preg_replace('/\[app\](.*?)\[\/app\]/', '', $s); + // strip off bbcode $naked_body = preg_replace('/\[(.+?)\]/', '', $naked_body); + + // strip any links + $naked_body = preg_replace('/\b(https?|ftp|file):\/\/[-A-Z0-9+&@#\/%?=~_|$!:,.;]*[A-Z0-9+&@#\/%=~_|$]/i', '', $naked_body); + if (mb_strlen($naked_body) < intval($min_length)) { logger('string length less than ' . intval($min_length), LOGGER_DATA); - return ''; + return EMPTY_STR; } - $l = new Text_LanguageDetect; - try { - // return 2-letter ISO 639-1 (en) language code - $l->setNameMode(2); - $lng = $l->detectConfidence($naked_body); - logger('detect language: ' . print_r($lng, true) . $naked_body, LOGGER_DATA); - } catch (Text_LanguageDetect_Exception $e) { - logger('detect language exception: ' . $e->getMessage(), LOGGER_DATA); - } + $lang = new Language; + $lang_arr = $lang->detect($naked_body)->limit(0, 1)->close(); - if ((! $lng) || (! (x($lng,'language')))) { - return ''; + $confidence = reset($lang_arr); + if ($confidence >= intval($min_confidence)) { + logger('detect language: ' . print_r($lang_arr, true) . 
$naked_body, LOGGER_DATA); + return key($lang_arr); } - if ($lng['confidence'] < (float) $min_confidence) { - logger('detect language: confidence less than ' . (float) $min_confidence, LOGGER_DATA); - return ''; - } + logger('detect language: confidence less than ' . $min_confidence, LOGGER_DATA); - return($lng['language']); + return EMPTY_STR; } /** -- cgit v1.2.3