diff options
Diffstat (limited to 'library/spam/b8/lexer/lexer_default.php')
-rw-r--r-- | library/spam/b8/lexer/lexer_default.php | 205 |
1 files changed, 205 insertions, 0 deletions
diff --git a/library/spam/b8/lexer/lexer_default.php b/library/spam/b8/lexer/lexer_default.php new file mode 100644 index 000000000..7b5ca22bf --- /dev/null +++ b/library/spam/b8/lexer/lexer_default.php @@ -0,0 +1,205 @@ +<?php + +# Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de> +# +# This file is part of the b8 package +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation in version 2.1 of the License. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. + +/** + * Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de> + * + * @license LGPL + * @access public + * @package b8 + * @author Tobias Leupold + * @author Oliver Lillie (aka buggedcom) (original PHP 5 port) + */ + +class b8_lexer_default +{ + + const LEXER_TEXT_NOT_STRING = 'LEXER_TEXT_NOT_STRING'; + const LEXER_TEXT_EMPTY = 'LEXER_TEXT_EMPTY'; + + public $config = NULL; + + # The regular expressions we use to split the text to tokens + + public $regexp = array( + 'ip' => '/([A-Za-z0-9\_\-\.]+)/', + 'raw_split' => '/[\s,\.\/"\:;\|<>\-_\[\]{}\+=\)\(\*\&\^%]+/', + 'html' => '/(<.+?>)/', + 'tagname' => '/(.+?)\s/', + 'numbers' => '/^[0-9]+$/' + ); + + /** + * Constructs the lexer. + * + * @access public + * @return void + */ + + function __construct($config) + { + $this->config = $config; + } + + /** + * Generates the tokens required for the bayesian filter. + * + * @access public + * @param string $text + * @return array Returns the list of tokens + */ + + public function get_tokens($text) + { + + # Check that we actually have a string ... + if(is_string($text) === FALSE) + return self::LEXER_TEXT_NOT_STRING; + + # ... and that it's not empty + if(empty($text) === TRUE) + return self::LEXER_TEXT_EMPTY; + + # Re-convert the text to the original characters coded in UTF-8, as + # they have been coded in html entities during the post process + $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8'); + + $tokens = array(); + + # Find URLs and IP addresses + + preg_match_all($this->regexp['ip'], $text, $raw_tokens); + + foreach($raw_tokens[1] as $word) { + + # Check for a dot + if(strpos($word, '.') === FALSE) + continue; + + # Check that the word is valid, min and max sizes, etc. + if($this->_is_valid($word) === FALSE) + continue; + + if(isset($tokens[$word]) === FALSE) + $tokens[$word] = 1; + else + $tokens[$word] += 1; + + # Delete the word from the text so it doesn't get re-added. + $text = str_replace($word, '', $text); + + # Also process the parts of the URLs + $url_parts = preg_split($this->regexp['raw_split'], $word); + + foreach($url_parts as $word) { + + # Again validate the part + + if($this->_is_valid($word) === FALSE) + continue; + + if(isset($tokens[$word]) === FALSE) + $tokens[$word] = 1; + else + $tokens[$word] += 1; + + } + + } + + # Split the remaining text + + $raw_tokens = preg_split($this->regexp['raw_split'], $text); + + foreach($raw_tokens as $word) { + + # Again validate the part + + if($this->_is_valid($word) === FALSE) + continue; + + if(isset($tokens[$word]) === FALSE) + $tokens[$word] = 1; + else + $tokens[$word] += 1; + + } + + # Process the HTML + + preg_match_all($this->regexp['html'], $text, $raw_tokens); + + foreach($raw_tokens[1] as $word) { + + # Again validate the part + + if($this->_is_valid($word) === FALSE) + continue; + + # If the tag has parameters, just use the tag itself + + if(strpos($word, ' ') !== FALSE) { + preg_match($this->regexp['tagname'], $word, $tmp); + $word = "{$tmp[1]}...>"; + } + + if(isset($tokens[$word]) === FALSE) + $tokens[$word] = 1; + else + $tokens[$word] += 1; + + } + + # Return a list of all found tokens + return $tokens; + + } + + /** + * Validates a token. + * + * @access private + * @param string $token The token string. + * @return boolean Returns TRUE if the token is valid, otherwise returns FALSE + */ + + private function _is_valid($token) + { + + # Validate the size of the token + + $len = strlen($token); + + if($len < $this->config['min_size'] or $len > $this->config['max_size']) + return FALSE; + + # We may want to exclude pure numbers + if($this->config['allow_numbers'] === FALSE) { + if(preg_match($this->regexp['numbers'], $token) > 0) + return FALSE; + } + + # Token is okay + return TRUE; + + } + +} + +?>
\ No newline at end of file |