Merge branch 'dev' into sabre32

author: Mario Vavti <mario@mariovavti.com> 2016-05-31 10:59:11 +0200
committer: Mario Vavti <mario@mariovavti.com> 2016-05-31 10:59:11 +0200
commit: 316fee93f7f7df92db514de961b74a71556fdb63 (patch)
tree: 896fd1d53ccb3863380653d1b065c7fc4ef7a641 /library/spam/b8/lexer/lexer_default.php
parent: 1523e116b91824394244c4f76a9e105a3cecd5b6 (diff)
parent: 670e83b30050201e3ac069c9dfa86a92aff2431d (diff)
download: volse-hubzilla-316fee93f7f7df92db514de961b74a71556fdb63.tar.gz
volse-hubzilla-316fee93f7f7df92db514de961b74a71556fdb63.tar.bz2
volse-hubzilla-316fee93f7f7df92db514de961b74a71556fdb63.zip
1 files changed, 0 insertions, 205 deletions
diff --git a/library/spam/b8/lexer/lexer_default.php b/library/spam/b8/lexer/lexer_default.php
deleted file mode 100644
index 7b5ca22bf..000000000
--- a/library/spam/b8/lexer/lexer_default.php
+++ /dev/null
@@ -1,205 +0,0 @@
-<?php
-
-#   Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
-#
-#   This file is part of the b8 package
-#
-#   This program is free software; you can redistribute it and/or modify it
-#   under the terms of the GNU Lesser General Public License as published by
-#   the Free Software Foundation in version 2.1 of the License.
-#
-#   This program is distributed in the hope that it will be useful, but
-#   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-#   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
-#   License for more details.
-#
-#   You should have received a copy of the GNU Lesser General Public License
-#   along with this program; if not, write to the Free Software Foundation,
-#   Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
-
-/**
- * Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
- *
- * @license LGPL
- * @access public
- * @package b8
- * @author Tobias Leupold
- * @author Oliver Lillie (aka buggedcom) (original PHP 5 port)
- */
-
-class b8_lexer_default
-{
-
-	const LEXER_TEXT_NOT_STRING = 'LEXER_TEXT_NOT_STRING';
-	const LEXER_TEXT_EMPTY      = 'LEXER_TEXT_EMPTY';
-
-	public $config = NULL;
-
-	# The regular expressions we use to split the text to tokens
-
-	public $regexp = array(
-		'ip'        => '/([A-Za-z0-9\_\-\.]+)/',
-		'raw_split' => '/[\s,\.\/"\:;\|<>\-_\[\]{}\+=\)\(\*\&\^%]+/',
-		'html'      => '/(<.+?>)/',
-		'tagname'   => '/(.+?)\s/',
-		'numbers'   => '/^[0-9]+$/'
-	);
-
-	/**
-	 * Constructs the lexer.
-	 *
-	 * @access public
-	 * @return void
-	 */
-
-	function __construct($config)
-	{
-		$this->config = $config;
-	}
-
-	/**
-	 * Generates the tokens required for the bayesian filter.
-	 *
-	 * @access public
-	 * @param string $text
-	 * @return array Returns the list of tokens
-	 */
-
-	public function get_tokens($text)
-	{
-
-		# Check that we actually have a string ...
-		if(is_string($text) === FALSE)
-			return self::LEXER_TEXT_NOT_STRING;
-
-		# ... and that it's not empty
-		if(empty($text) === TRUE)
-			return self::LEXER_TEXT_EMPTY;
-
-		# Re-convert the text to the original characters coded in UTF-8, as
-		# they have been coded in html entities during the post process
-		$text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');
-
-		$tokens = array();
-
-		# Find URLs and IP addresses
-
-		preg_match_all($this->regexp['ip'], $text, $raw_tokens);
-
-		foreach($raw_tokens[1] as $word) {
-
-			# Check for a dot
-			if(strpos($word, '.') === FALSE)
-				continue;
-
-			# Check that the word is valid, min and max sizes, etc.
-			if($this->_is_valid($word) === FALSE)
-				continue;
-
-			if(isset($tokens[$word]) === FALSE)
-				$tokens[$word] = 1;
-			else
-				$tokens[$word] += 1;
-
-			# Delete the word from the text so it doesn't get re-added.
-			$text = str_replace($word, '', $text);
-
-			# Also process the parts of the URLs
-			$url_parts = preg_split($this->regexp['raw_split'], $word);
-
-			foreach($url_parts as $word) {
-
-				# Again validate the part
-
-				if($this->_is_valid($word) === FALSE)
-					continue;
-
-				if(isset($tokens[$word]) === FALSE)
-					$tokens[$word] = 1;
-				else
-					$tokens[$word] += 1;
-
-			}
-
-		}
-
-		# Split the remaining text
-
-		$raw_tokens = preg_split($this->regexp['raw_split'], $text);
-
-		foreach($raw_tokens as $word) {
-
-			# Again validate the part
-
-			if($this->_is_valid($word) === FALSE)
-				continue;
-
-			if(isset($tokens[$word]) === FALSE)
-				$tokens[$word] = 1;
-			else
-				$tokens[$word] += 1;
-
-		}
-
-		# Process the HTML
-
-		preg_match_all($this->regexp['html'], $text, $raw_tokens);
-
-		foreach($raw_tokens[1] as $word) {
-
-			# Again validate the part
-
-			if($this->_is_valid($word) === FALSE)
-				continue;
-
-			# If the tag has parameters, just use the tag itself
-
-			if(strpos($word, ' ') !== FALSE) {
-				preg_match($this->regexp['tagname'], $word, $tmp);
-				$word = "{$tmp[1]}...>";
-			}
-
-			if(isset($tokens[$word]) === FALSE)
-				$tokens[$word] = 1;
-			else
-				$tokens[$word] += 1;
-
-		}
-
-		# Return a list of all found tokens
-		return $tokens;
-
-	}
-
-	/**
-	 * Validates a token.
-	 *
-	 * @access private
-	 * @param string $token The token string.
-	 * @return boolean Returns TRUE if the token is valid, otherwise returns FALSE
-	 */
-
-	private function _is_valid($token)
-	{
-
-		# Validate the size of the token
-
-		$len = strlen($token);
-
-		if($len < $this->config['min_size'] or $len > $this->config['max_size'])
-			return FALSE;
-
-		# We may want to exclude pure numbers
-		if($this->config['allow_numbers'] === FALSE) {
-			if(preg_match($this->regexp['numbers'], $token) > 0)
-				return FALSE;
-		}
-
-		# Token is okay
-		return TRUE;
-
-	}
-
-}
-
-?>
-\ No newline at end of file
author	Mario Vavti <mario@mariovavti.com>	2016-05-31 10:59:11 +0200
committer	Mario Vavti <mario@mariovavti.com>	2016-05-31 10:59:11 +0200
commit	316fee93f7f7df92db514de961b74a71556fdb63 (patch)
tree	896fd1d53ccb3863380653d1b065c7fc4ef7a641 /library/spam/b8/lexer/lexer_default.php
parent	1523e116b91824394244c4f76a9e105a3cecd5b6 (diff)
parent	670e83b30050201e3ac069c9dfa86a92aff2431d (diff)
download	volse-hubzilla-316fee93f7f7df92db514de961b74a71556fdb63.tar.gz volse-hubzilla-316fee93f7f7df92db514de961b74a71556fdb63.tar.bz2 volse-hubzilla-316fee93f7f7df92db514de961b74a71556fdb63.zip