aboutsummaryrefslogtreecommitdiffstats
path: root/library/spam/b8/lexer/lexer_default.php
diff options
context:
space:
mode:
authorMario Vavti <mario@mariovavti.com>2016-05-31 10:59:11 +0200
committerMario Vavti <mario@mariovavti.com>2016-05-31 10:59:11 +0200
commit316fee93f7f7df92db514de961b74a71556fdb63 (patch)
tree896fd1d53ccb3863380653d1b065c7fc4ef7a641 /library/spam/b8/lexer/lexer_default.php
parent1523e116b91824394244c4f76a9e105a3cecd5b6 (diff)
parent670e83b30050201e3ac069c9dfa86a92aff2431d (diff)
downloadvolse-hubzilla-316fee93f7f7df92db514de961b74a71556fdb63.tar.gz
volse-hubzilla-316fee93f7f7df92db514de961b74a71556fdb63.tar.bz2
volse-hubzilla-316fee93f7f7df92db514de961b74a71556fdb63.zip
Merge branch 'dev' into sabre32
Diffstat (limited to 'library/spam/b8/lexer/lexer_default.php')
-rw-r--r--library/spam/b8/lexer/lexer_default.php205
1 files changed, 0 insertions, 205 deletions
diff --git a/library/spam/b8/lexer/lexer_default.php b/library/spam/b8/lexer/lexer_default.php
deleted file mode 100644
index 7b5ca22bf..000000000
--- a/library/spam/b8/lexer/lexer_default.php
+++ /dev/null
@@ -1,205 +0,0 @@
-<?php
-
-# Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
-#
-# This file is part of the b8 package
-#
-# This program is free software; you can redistribute it and/or modify it
-# under the terms of the GNU Lesser General Public License as published by
-# the Free Software Foundation in version 2.1 of the License.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-# License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public License
-# along with this program; if not, write to the Free Software Foundation,
-# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
-
-/**
- * Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
- *
- * @license LGPL
- * @access public
- * @package b8
- * @author Tobias Leupold
- * @author Oliver Lillie (aka buggedcom) (original PHP 5 port)
- */
-
-class b8_lexer_default
-{
-
- const LEXER_TEXT_NOT_STRING = 'LEXER_TEXT_NOT_STRING';
- const LEXER_TEXT_EMPTY = 'LEXER_TEXT_EMPTY';
-
- public $config = NULL;
-
- # The regular expressions we use to split the text to tokens
-
- public $regexp = array(
- 'ip' => '/([A-Za-z0-9\_\-\.]+)/',
- 'raw_split' => '/[\s,\.\/"\:;\|<>\-_\[\]{}\+=\)\(\*\&\^%]+/',
- 'html' => '/(<.+?>)/',
- 'tagname' => '/(.+?)\s/',
- 'numbers' => '/^[0-9]+$/'
- );
-
- /**
- * Constructs the lexer.
- *
- * @access public
- * @return void
- */
-
- function __construct($config)
- {
- $this->config = $config;
- }
-
- /**
- * Generates the tokens required for the bayesian filter.
- *
- * @access public
- * @param string $text
- * @return array Returns the list of tokens
- */
-
- public function get_tokens($text)
- {
-
- # Check that we actually have a string ...
- if(is_string($text) === FALSE)
- return self::LEXER_TEXT_NOT_STRING;
-
- # ... and that it's not empty
- if(empty($text) === TRUE)
- return self::LEXER_TEXT_EMPTY;
-
- # Re-convert the text to the original characters coded in UTF-8, as
- # they have been coded in html entities during the post process
- $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');
-
- $tokens = array();
-
- # Find URLs and IP addresses
-
- preg_match_all($this->regexp['ip'], $text, $raw_tokens);
-
- foreach($raw_tokens[1] as $word) {
-
- # Check for a dot
- if(strpos($word, '.') === FALSE)
- continue;
-
- # Check that the word is valid, min and max sizes, etc.
- if($this->_is_valid($word) === FALSE)
- continue;
-
- if(isset($tokens[$word]) === FALSE)
- $tokens[$word] = 1;
- else
- $tokens[$word] += 1;
-
- # Delete the word from the text so it doesn't get re-added.
- $text = str_replace($word, '', $text);
-
- # Also process the parts of the URLs
- $url_parts = preg_split($this->regexp['raw_split'], $word);
-
- foreach($url_parts as $word) {
-
- # Again validate the part
-
- if($this->_is_valid($word) === FALSE)
- continue;
-
- if(isset($tokens[$word]) === FALSE)
- $tokens[$word] = 1;
- else
- $tokens[$word] += 1;
-
- }
-
- }
-
- # Split the remaining text
-
- $raw_tokens = preg_split($this->regexp['raw_split'], $text);
-
- foreach($raw_tokens as $word) {
-
- # Again validate the part
-
- if($this->_is_valid($word) === FALSE)
- continue;
-
- if(isset($tokens[$word]) === FALSE)
- $tokens[$word] = 1;
- else
- $tokens[$word] += 1;
-
- }
-
- # Process the HTML
-
- preg_match_all($this->regexp['html'], $text, $raw_tokens);
-
- foreach($raw_tokens[1] as $word) {
-
- # Again validate the part
-
- if($this->_is_valid($word) === FALSE)
- continue;
-
- # If the tag has parameters, just use the tag itself
-
- if(strpos($word, ' ') !== FALSE) {
- preg_match($this->regexp['tagname'], $word, $tmp);
- $word = "{$tmp[1]}...>";
- }
-
- if(isset($tokens[$word]) === FALSE)
- $tokens[$word] = 1;
- else
- $tokens[$word] += 1;
-
- }
-
- # Return a list of all found tokens
- return $tokens;
-
- }
-
- /**
- * Validates a token.
- *
- * @access private
- * @param string $token The token string.
- * @return boolean Returns TRUE if the token is valid, otherwise returns FALSE
- */
-
- private function _is_valid($token)
- {
-
- # Validate the size of the token
-
- $len = strlen($token);
-
- if($len < $this->config['min_size'] or $len > $this->config['max_size'])
- return FALSE;
-
- # We may want to exclude pure numbers
- if($this->config['allow_numbers'] === FALSE) {
- if(preg_match($this->regexp['numbers'], $token) > 0)
- return FALSE;
- }
-
- # Token is okay
- return TRUE;
-
- }
-
-}
-
-?> \ No newline at end of file