aboutsummaryrefslogtreecommitdiffstats
path: root/library/spam/b8/degenerator
diff options
context:
space:
mode:
authorfriendica <info@friendica.com>2012-01-31 15:54:41 -0800
committerfriendica <info@friendica.com>2012-01-31 15:54:41 -0800
commitc8c062d96024a23e67e73f1ebffcf7009b18eed1 (patch)
treeec8382bc177f4f8be6516ab8d2e420088d6f9733 /library/spam/b8/degenerator
parent4fc455d195fef9f1bfbc9f125788650fb3153237 (diff)
downloadvolse-hubzilla-c8c062d96024a23e67e73f1ebffcf7009b18eed1.tar.gz
volse-hubzilla-c8c062d96024a23e67e73f1ebffcf7009b18eed1.tar.bz2
volse-hubzilla-c8c062d96024a23e67e73f1ebffcf7009b18eed1.zip
add spam engine
Diffstat (limited to 'library/spam/b8/degenerator')
-rw-r--r--library/spam/b8/degenerator/degenerator_default.php127
1 files changed, 127 insertions, 0 deletions
diff --git a/library/spam/b8/degenerator/degenerator_default.php b/library/spam/b8/degenerator/degenerator_default.php
new file mode 100644
index 000000000..4ff6d882b
--- /dev/null
+++ b/library/spam/b8/degenerator/degenerator_default.php
@@ -0,0 +1,127 @@
+<?php
+
+# Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
+#
+# This file is part of the b8 package
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation in version 2.1 of the License.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program; if not, write to the Free Software Foundation,
+# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+
+/**
+ * Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
+ *
+ * @license LGPL
+ * @access public
+ * @package b8
+ * @author Tobias Leupold
+ */
+
+class b8_degenerator_default
+{
+
+ public $degenerates = array();
+
+ /**
+ * Generates a list of "degenerated" words for a list of words.
+ *
+ * @access public
+ * @param array $tokens
+ * @return array An array containing an array of degenerated tokens for each token
+ */
+
+ public function degenerate(array $words)
+ {
+
+ $degenerates = array();
+
+ foreach($words as $word)
+ $degenerates[$word] = $this->_degenerate_word($word);
+
+ return $degenerates;
+
+ }
+
+ /**
+ * If the original word is not found in the database then
+ * we build "degenerated" versions of the word to lookup.
+ *
+ * @access private
+ * @param string $word
+ * @return array An array of degenerated words
+ */
+
+ protected function _degenerate_word($word)
+ {
+
+ # Check for any stored words so the process doesn't have to repeat
+ if(isset($this->degenerates[$word]) === TRUE)
+ return $this->degenerates[$word];
+
+ $degenerate = array();
+
+ # Add different version of upper and lower case and ucfirst
+ array_push($degenerate, strtolower($word));
+ array_push($degenerate, strtoupper($word));
+ array_push($degenerate, ucfirst($word));
+
+ # Degenerate all versions
+
+ foreach($degenerate as $alt_word) {
+
+ # Look for stuff like !!! and ???
+
+ if(preg_match('/[!?]$/', $alt_word) > 0) {
+
+ # Add versions with different !s and ?s
+
+ if(preg_match('/[!?]{2,}$/', $alt_word) > 0) {
+ $tmp = preg_replace('/([!?])+$/', '$1', $alt_word);
+ array_push($degenerate, $tmp);
+ }
+
+ $tmp = preg_replace('/([!?])+$/', '', $alt_word);
+ array_push($degenerate, $tmp);
+
+ }
+
+ # Look for ... at the end of the word
+
+ $alt_word_int = $alt_word;
+
+ while(preg_match('/[\.]$/', $alt_word_int) > 0) {
+ $alt_word_int = substr($alt_word_int, 0, strlen($alt_word_int) - 1);
+ array_push($degenerate, $alt_word_int);
+ }
+
+ }
+
+ # Some degenerates are the same as the original word. These don't have
+ # to be fetched, so we create a new array with only new tokens
+
+ $real_degenerate = array();
+
+ foreach($degenerate as $deg_word) {
+ if($word != $deg_word)
+ array_push($real_degenerate, $deg_word);
+ }
+
+ # Store the list of degenerates for the token
+ $this->degenerates[$word] = $real_degenerate;
+
+ return $real_degenerate;
+
+ }
+
+}
+
+?> \ No newline at end of file