From 62727012d37ef3d3cacc413d5667dc2d7bbf9cbb Mon Sep 17 00:00:00 2001 From: friendica Date: Sat, 12 May 2012 01:55:18 -0700 Subject: language detection library --- library/langdet/docs/example_clui.php | 35 +++++++++++++++++ library/langdet/docs/example_web.php | 72 +++++++++++++++++++++++++++++++++++ library/langdet/docs/iso.php | 21 ++++++++++ 3 files changed, 128 insertions(+) create mode 100644 library/langdet/docs/example_clui.php create mode 100644 library/langdet/docs/example_web.php create mode 100644 library/langdet/docs/iso.php (limited to 'library/langdet/docs') diff --git a/library/langdet/docs/example_clui.php b/library/langdet/docs/example_clui.php new file mode 100644 index 000000000..8e7d8577d --- /dev/null +++ b/library/langdet/docs/example_clui.php @@ -0,0 +1,35 @@ +getLanguages(); +sort($langs); +echo join(', ', $langs); + +echo "\ntotal ", count($langs), "\n\n"; + +while ($line = fgets($stdin)) { + $result = $l->detect($line, 4); + print_r($result); + $blocks = $l->detectUnicodeBlocks($line, true); + print_r($blocks); +} + +fclose($stdin); +unset($l); + +/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ + +?> diff --git a/library/langdet/docs/example_web.php b/library/langdet/docs/example_web.php new file mode 100644 index 000000000..1e155fef2 --- /dev/null +++ b/library/langdet/docs/example_web.php @@ -0,0 +1,72 @@ + + + +Text_LanguageDetect demonstration + + +

Text_LanguageDetect

+Supported languages:\n"; +$langs = $l->getLanguages(); +sort($langs); +foreach ($langs as $lang) { + echo ucfirst($lang), ', '; + $i++; +} + +echo "
total $i

"; + +?> +
+Enter text to identify language (at least a couple of sentences):
+ +
+ +
+utf8strlen($q); + if ($len < 20) { // this value picked somewhat arbitrarily + echo "Warning: string not very long ($len chars)
\n"; + } + + $result = $l->detectConfidence($q); + + if ($result == null) { + echo "Text_LanguageDetect cannot identify this piece of text.

\n"; + } else { + echo "Text_LanguageDetect thinks this text is written in {$result['language']} ({$result['similarity']}, {$result['confidence']})

\n"; + } + + $result = $l->detectUnicodeBlocks($q, false); + if (!empty($result)) { + arsort($result); + echo "Unicode blocks present: ", join(', ', array_keys($result)), "\n

"; + } +} + +unset($l); + +/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */ + +?> + diff --git a/library/langdet/docs/iso.php b/library/langdet/docs/iso.php new file mode 100644 index 000000000..6d7ec1d2e --- /dev/null +++ b/library/langdet/docs/iso.php @@ -0,0 +1,21 @@ +setNameMode(2); +echo $l->detectSimple('Das ist ein kleiner Text') . "\n"; + +//will output the ISO 639-2 three-letter language code +// "deu" +$l->setNameMode(3); +echo $l->detectSimple('Das ist ein kleiner Text') . "\n"; + +?> \ No newline at end of file -- cgit v1.2.3