aboutsummaryrefslogtreecommitdiffstats
path: root/library/spam/b8/lexer
diff options
context:
space:
mode:
author friendica <info@friendica.com> 2012-01-31 15:54:41 -0800
committer friendica <info@friendica.com> 2012-01-31 15:54:41 -0800
commit c8c062d96024a23e67e73f1ebffcf7009b18eed1 (patch)
tree ec8382bc177f4f8be6516ab8d2e420088d6f9733 /library/spam/b8/lexer
parent 4fc455d195fef9f1bfbc9f125788650fb3153237 (diff)
download volse-hubzilla-c8c062d96024a23e67e73f1ebffcf7009b18eed1.tar.gz
volse-hubzilla-c8c062d96024a23e67e73f1ebffcf7009b18eed1.tar.bz2
volse-hubzilla-c8c062d96024a23e67e73f1ebffcf7009b18eed1.zip
add spam engine
Diffstat (limited to 'library/spam/b8/lexer')
-rw-r--r-- library/spam/b8/lexer/lexer_default.php 205
1 files changed, 205 insertions, 0 deletions
diff --git a/library/spam/b8/lexer/lexer_default.php b/library/spam/b8/lexer/lexer_default.php
new file mode 100644
index 000000000..7b5ca22bf
--- /dev/null
+++ b/library/spam/b8/lexer/lexer_default.php
@@ -0,0 +1,205 @@
+<?php
+
+# Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
+#
+# This file is part of the b8 package
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation in version 2.1 of the License.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+# License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program; if not, write to the Free Software Foundation,
+# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+
+/**
+ * Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
+ *
+ * @license LGPL
+ * @access public
+ * @package b8
+ * @author Tobias Leupold
+ * @author Oliver Lillie (aka buggedcom) (original PHP 5 port)
+ */
+
+class b8_lexer_default
+{
+ # Codes returned by get_tokens() instead of a token array when the input is unusable
+ const LEXER_TEXT_NOT_STRING = 'LEXER_TEXT_NOT_STRING';
+ const LEXER_TEXT_EMPTY = 'LEXER_TEXT_EMPTY';
+
+ public $config = NULL;
+
+ # The regular expressions we use to split the text to tokens
+ # ('ip', 'raw_split', 'html' and 'tagname' are used by get_tokens(), 'numbers' by _is_valid())
+ public $regexp = array(
+ 'ip' => '/([A-Za-z0-9\_\-\.]+)/',
+ 'raw_split' => '/[\s,\.\/"\:;\|<>\-_\[\]{}\+=\)\(\*\&\^%]+/',
+ 'html' => '/(<.+?>)/',
+ 'tagname' => '/(.+?)\s/',
+ 'numbers' => '/^[0-9]+$/'
+ );
+
+ /**
+ * Constructs the lexer and stores the given configuration in $this->config.
+ *
+ * @access public
+ * @param array $config Settings; _is_valid() reads the keys 'min_size', 'max_size' and 'allow_numbers'
+ * @return void
+ */
+ function __construct($config)
+ {
+ $this->config = $config;
+ }
+
+ /**
+ * Generates the tokens required for the bayesian filter from URL/IP-like words, plain words and HTML tags.
+ *
+ * @access public
+ * @param string $text Text to tokenize; HTML entities in it are decoded (as UTF-8) before splitting
+ * @return array|string Map of token => number of occurrences, or one of the LEXER_TEXT_*
+ * constants if $text is not a string or is empty
+ */
+ public function get_tokens($text)
+ {
+
+ # Check that we actually have a string ...
+ if(is_string($text) === FALSE)
+ return self::LEXER_TEXT_NOT_STRING;
+
+ # ... and that it's not empty (NOTE(review): empty() is also TRUE for the string "0")
+ if(empty($text) === TRUE)
+ return self::LEXER_TEXT_EMPTY;
+
+ # Re-convert the text to the original characters coded in UTF-8, as
+ # they have been coded in html entities during the post process
+ $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');
+
+ $tokens = array();
+
+ # Find URLs and IP addresses: the 'ip' pattern captures every run of letters, digits, "_", "-" and "."
+
+ preg_match_all($this->regexp['ip'], $text, $raw_tokens);
+
+ foreach($raw_tokens[1] as $word) {
+
+ # Check for a dot; words without one are left for the plain-text pass below
+ if(strpos($word, '.') === FALSE)
+ continue;
+
+ # Check that the word is valid, min and max sizes, etc.
+ if($this->_is_valid($word) === FALSE)
+ continue;
+
+ if(isset($tokens[$word]) === FALSE)
+ $tokens[$word] = 1;
+ else
+ $tokens[$word] += 1;
+
+ # Delete the word from the text so it doesn't get re-added (str_replace() removes every occurrence, also inside longer words).
+ $text = str_replace($word, '', $text);
+
+ # Also process the parts of the URLs (split on the same separators as the plain text)
+ $url_parts = preg_split($this->regexp['raw_split'], $word);
+ # The inner loop reuses $word; harmless, as the outer loop does not read it again before reassigning it
+ foreach($url_parts as $word) {
+
+ # Again validate the part
+
+ if($this->_is_valid($word) === FALSE)
+ continue;
+
+ if(isset($tokens[$word]) === FALSE)
+ $tokens[$word] = 1;
+ else
+ $tokens[$word] += 1;
+
+ }
+
+ }
+
+ # Split the remaining text (the URL/IP words counted above have been removed from it)
+
+ $raw_tokens = preg_split($this->regexp['raw_split'], $text);
+
+ foreach($raw_tokens as $word) {
+
+ # Validate the word (size limits, optional number filter)
+
+ if($this->_is_valid($word) === FALSE)
+ continue;
+
+ if(isset($tokens[$word]) === FALSE)
+ $tokens[$word] = 1;
+ else
+ $tokens[$word] += 1;
+
+ }
+
+ # Process the HTML: every non-greedy "<...>" match in the remaining text is a tag candidate
+
+ preg_match_all($this->regexp['html'], $text, $raw_tokens);
+
+ foreach($raw_tokens[1] as $word) {
+
+ # Validate the whole tag, including any parameters, against the size limits
+
+ if($this->_is_valid($word) === FALSE)
+ continue;
+
+ # If the tag has parameters (contains a space), keep only the part before the first whitespace and append "...>"
+
+ if(strpos($word, ' ') !== FALSE) {
+ preg_match($this->regexp['tagname'], $word, $tmp);
+ $word = "{$tmp[1]}...>";
+ }
+
+ if(isset($tokens[$word]) === FALSE)
+ $tokens[$word] = 1;
+ else
+ $tokens[$word] += 1;
+
+ }
+
+ # Return all found tokens as token => count
+ return $tokens;
+
+ }
+
+ /**
+ * Validates a token against the configured size limits and number policy.
+ *
+ * @access private
+ * @param string $token The token string.
+ * @return boolean Returns TRUE if the token is valid, otherwise returns FALSE
+ */
+
+ private function _is_valid($token)
+ {
+
+ # Validate the size of the token (strlen() counts bytes, not UTF-8 characters)
+
+ $len = strlen($token);
+
+ if($len < $this->config['min_size'] or $len > $this->config['max_size'])
+ return FALSE;
+
+ # We may want to exclude pure numbers (only when 'allow_numbers' is exactly FALSE)
+ if($this->config['allow_numbers'] === FALSE) {
+ if(preg_match($this->regexp['numbers'], $token) > 0)
+ return FALSE;
+ }
+
+ # Token is okay
+ return TRUE;
+
+ }
+
+}
+
+?>
\ No newline at end of file