diff options
author | Andrew Manning <tamanning@zoho.com> | 2016-06-02 22:32:50 -0400 |
---|---|---|
committer | Andrew Manning <tamanning@zoho.com> | 2016-06-02 22:32:50 -0400 |
commit | b93e398674b375a3b14718fc6dd2a815aad9b387 (patch) | |
tree | 7c2a8097e1c90a87cc8207b5fe08a064f4fa3ae8 /library/spam/b8/lexer/lexer_default.php | |
parent | b70c6809648bb3c78e5e26f9293727b3a7aa4025 (diff) | |
parent | f9075e2a2feca0f37fdf568be6e6e53460aa9034 (diff) | |
download | volse-hubzilla-b93e398674b375a3b14718fc6dd2a815aad9b387.tar.gz volse-hubzilla-b93e398674b375a3b14718fc6dd2a815aad9b387.tar.bz2 volse-hubzilla-b93e398674b375a3b14718fc6dd2a815aad9b387.zip |
Merge remote-tracking branch 'upstream/dev' into wiki
Diffstat (limited to 'library/spam/b8/lexer/lexer_default.php')
-rw-r--r-- | library/spam/b8/lexer/lexer_default.php | 205 |
1 files changed, 0 insertions, 205 deletions
<?php

# Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
#
# This file is part of the b8 package
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation in version 2.1 of the License.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
# License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

/**
 * Default lexer of the b8 package: splits a text into countable tokens.
 *
 * Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
 *
 * @license LGPL
 * @access public
 * @package b8
 * @author Tobias Leupold
 * @author Oliver Lillie (aka buggedcom) (original PHP 5 port)
 */

class b8_lexer_default
{

    # Values returned by get_tokens() instead of a token list when the input
    # cannot be processed
    const LEXER_TEXT_NOT_STRING = 'LEXER_TEXT_NOT_STRING';
    const LEXER_TEXT_EMPTY      = 'LEXER_TEXT_EMPTY';

    # Configuration as handed to the constructor. The lexer itself reads the
    # keys 'min_size', 'max_size' and 'allow_numbers' (see _is_valid()).
    public $config = NULL;

    # The regular expressions we use to split the text to tokens
    public $regexp = array(
        'ip'        => '/([A-Za-z0-9\_\-\.]+)/',
        'raw_split' => '/[\s,\.\/"\:;\|<>\-_\[\]{}\+=\)\(\*\&\^%]+/',
        'html'      => '/(<.+?>)/',
        'tagname'   => '/(.+?)\s/',
        'numbers'   => '/^[0-9]+$/'
    );

    /**
     * Constructs the lexer.
     *
     * @access public
     * @param array $config Lexer configuration; stored as-is in $this->config
     * @return void
     */

    public function __construct($config)
    {
        $this->config = $config;
    }

    /**
     * Generates the tokens required for the bayesian filter.
     *
     * The text is processed in three passes whose results are merged into
     * one map: (1) dot-containing words (URLs, IP addresses) plus their
     * parts, (2) the remaining text split on punctuation and whitespace,
     * (3) HTML tags.
     *
     * @access public
     * @param string $text
     * @return array|string Map of token => number of occurrences, or one of
     *                      the LEXER_TEXT_* constants if $text is not a
     *                      string or is the empty string
     */

    public function get_tokens($text)
    {

        # Check that we actually have a string ...
        if (is_string($text) === FALSE) {
            return self::LEXER_TEXT_NOT_STRING;
        }

        # ... and that it's not empty. This is a strict comparison on
        # purpose: empty() would also reject the non-empty string "0".
        if ($text === '') {
            return self::LEXER_TEXT_EMPTY;
        }

        # Re-convert the text to the original characters coded in UTF-8, as
        # they have been coded in html entities during the post process
        $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');

        $tokens = array();

        # Find URLs and IP addresses

        preg_match_all($this->regexp['ip'], $text, $candidates);

        foreach ($candidates[1] as $word) {

            # Only words containing a dot are treated as URL / IP address
            if (strpos($word, '.') === FALSE) {
                continue;
            }

            # Check that the word is valid, min and max sizes, etc.
            if ($this->_is_valid($word) === FALSE) {
                continue;
            }

            $this->_count_token($tokens, $word);

            # Delete the word from the text so it doesn't get re-added.
            $text = str_replace($word, '', $text);

            # Also process the parts of the URLs
            $this->_count_valid_tokens(
                $tokens,
                preg_split($this->regexp['raw_split'], $word)
            );

        }

        # Split the remaining text

        $this->_count_valid_tokens(
            $tokens,
            preg_split($this->regexp['raw_split'], $text)
        );

        # Process the HTML

        preg_match_all($this->regexp['html'], $text, $tags);

        foreach ($tags[1] as $tag) {

            # The size check is applied to the complete tag, i.e. before it
            # is shortened below
            if ($this->_is_valid($tag) === FALSE) {
                continue;
            }

            # If the tag has parameters, just use the tag itself
            if (strpos($tag, ' ') !== FALSE) {
                preg_match($this->regexp['tagname'], $tag, $tmp);
                $tag = "{$tmp[1]}...>";
            }

            $this->_count_token($tokens, $tag);

        }

        # Return a list of all found tokens
        return $tokens;

    }

    /**
     * Counts every valid word of a list; invalid words are skipped.
     *
     * @access private
     * @param array $tokens Token map (token => count), updated in place
     * @param array $words  Candidate words
     * @return void
     */

    private function _count_valid_tokens(array &$tokens, array $words)
    {
        foreach ($words as $word) {
            if ($this->_is_valid($word) === TRUE) {
                $this->_count_token($tokens, $word);
            }
        }
    }

    /**
     * Increments the occurrence counter of one token.
     *
     * @access private
     * @param array $tokens Token map (token => count), updated in place
     * @param string $token The token to count
     * @return void
     */

    private function _count_token(array &$tokens, $token)
    {
        if (isset($tokens[$token]) === FALSE) {
            $tokens[$token] = 1;
        } else {
            $tokens[$token] += 1;
        }
    }

    /**
     * Validates a token.
     *
     * @access private
     * @param string $token The token string.
     * @return boolean Returns TRUE if the token is valid, otherwise returns FALSE
     */

    private function _is_valid($token)
    {

        # Validate the size of the token (strlen() counts bytes)

        $len = strlen($token);

        if ($len < $this->config['min_size'] || $len > $this->config['max_size']) {
            return FALSE;
        }

        # We may want to exclude pure numbers
        if ($this->config['allow_numbers'] === FALSE) {
            if (preg_match($this->regexp['numbers'], $token) > 0) {
                return FALSE;
            }
        }

        # Token is okay
        return TRUE;

    }

}
\ No newline at end of file |