aboutsummaryrefslogtreecommitdiffstats
path: root/include/language.php
diff options
context:
space:
mode:
author: Mario <mario@mariovavti.com> 2023-10-07 16:00:34 +0000
committer: Mario <mario@mariovavti.com> 2023-10-07 16:00:34 +0000
commit: 2f2e353ecef52639a78cac3bc407ccfe64197ac9 (patch)
tree: 557793121ec08dce6e97c064a100994e8d3990c5 /include/language.php
parent: 0092b7c0a4d6cf49c092e2232af63f87be63142b (diff)
download: volse-hubzilla-2f2e353ecef52639a78cac3bc407ccfe64197ac9.tar.gz
volse-hubzilla-2f2e353ecef52639a78cac3bc407ccfe64197ac9.tar.bz2
volse-hubzilla-2f2e353ecef52639a78cac3bc407ccfe64197ac9.zip
use new lang detect library which supports many more languages
Diffstat (limited to 'include/language.php')
-rw-r--r-- include/language.php | 45
1 files changed, 17 insertions, 28 deletions
diff --git a/include/language.php b/include/language.php
index d84f02a36..1b2e7332e 100644
--- a/include/language.php
+++ b/include/language.php
@@ -9,6 +9,7 @@
*/
use CommerceGuys\Intl\Language\LanguageRepository;
+use LanguageDetection\Language;
/**
* @brief Get the browser's submitted preferred languages.
@@ -299,14 +300,10 @@ function string_plural_select_default($n) {
/**
* @brief Takes a string and tries to identify the language.
*
- * It uses the pear library Text_LanguageDetect and it can identify 52 human languages.
- * It returns the identified languges and a confidence score for each.
- *
* Strings need to have a min length config['system']['language_detect_min_length']
* and you can influence the confidence that must be met before a result will get
* returned through config['system']['language_detect_min_confidence'].
*
- * @see http://pear.php.net/package/Text_LanguageDetect
* @param string $s A string to examine
* @return string Language code in 2-letter ISO 639-1 (en, de, fr) format
*/
@@ -316,43 +313,35 @@ function detect_language($s) {
return EMPTY_STR;
}
- $min_length = get_config('system', 'language_detect_min_length');
- if ($min_length === false)
- $min_length = LANGUAGE_DETECT_MIN_LENGTH;
-
- $min_confidence = get_config('system', 'language_detect_min_confidence');
- if ($min_confidence === false)
- $min_confidence = LANGUAGE_DETECT_MIN_CONFIDENCE;
+ $min_length = get_config('system', 'language_detect_min_length', LANGUAGE_DETECT_MIN_LENGTH);
+ $min_confidence = get_config('system', 'language_detect_min_confidence', LANGUAGE_DETECT_MIN_CONFIDENCE);
// embedded apps have long base64 strings which will trip up the detector.
$naked_body = preg_replace('/\[app\](.*?)\[\/app\]/', '', $s);
+
// strip off bbcode
$naked_body = preg_replace('/\[(.+?)\]/', '', $naked_body);
+
+ // strip any links
+ $naked_body = preg_replace('/\b(https?|ftp|file):\/\/[-A-Z0-9+&@#\/%?=~_|$!:,.;]*[A-Z0-9+&@#\/%=~_|$]/i', '', $naked_body);
+
if (mb_strlen($naked_body) < intval($min_length)) {
logger('string length less than ' . intval($min_length), LOGGER_DATA);
- return '';
+ return EMPTY_STR;
}
- $l = new Text_LanguageDetect;
- try {
- // return 2-letter ISO 639-1 (en) language code
- $l->setNameMode(2);
- $lng = $l->detectConfidence($naked_body);
- logger('detect language: ' . print_r($lng, true) . $naked_body, LOGGER_DATA);
- } catch (Text_LanguageDetect_Exception $e) {
- logger('detect language exception: ' . $e->getMessage(), LOGGER_DATA);
- }
+ $lang = new Language;
+ $lang_arr = $lang->detect($naked_body)->limit(0, 1)->close();
- if ((! $lng) || (! (x($lng,'language')))) {
- return '';
+ $confidence = reset($lang_arr);
+ if ($confidence >= intval($min_confidence)) {
+ logger('detect language: ' . print_r($lang_arr, true) . $naked_body, LOGGER_DATA);
+ return key($lang_arr);
}
- if ($lng['confidence'] < (float) $min_confidence) {
- logger('detect language: confidence less than ' . (float) $min_confidence, LOGGER_DATA);
- return '';
- }
+ logger('detect language: confidence less than ' . $min_confidence, LOGGER_DATA);
- return($lng['language']);
+ return EMPTY_STR;
}
/**