diff options
Diffstat (limited to 'library/spam/b8')
-rw-r--r-- | library/spam/b8/b8.php | 503 | ||||
-rw-r--r-- | library/spam/b8/degenerator/degenerator_default.php | 127 | ||||
-rw-r--r-- | library/spam/b8/lexer/lexer_default.php | 205 | ||||
-rw-r--r-- | library/spam/b8/storage/storage_base.php | 395 | ||||
-rw-r--r-- | library/spam/b8/storage/storage_dba.php | 198 | ||||
-rw-r--r-- | library/spam/b8/storage/storage_mysql.php | 351 |
6 files changed, 1779 insertions, 0 deletions
/**
 * b8 - A Bayesian spam filter written in PHP 5
 *
 * Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation in version 2.1 of the License. It is
 * distributed WITHOUT ANY WARRANTY; see the GNU Lesser General Public
 * License for more details.
 *
 * Front end of the filter: it loads a storage backend, a degenerator and a
 * lexer, and offers classify(), learn() and unlearn() on plain texts.
 *
 * @license LGPL
 * @access public
 * @package b8
 * @author Tobias Leupold
 * @author Oliver Lillie (aka buggedcom) (original PHP 5 port)
 */

class b8
{

    public $config = array(
        'min_size'      => 3,
        'max_size'      => 30,
        'allow_numbers' => FALSE,
        'lexer'         => 'default',
        'degenerator'   => 'default',
        'storage'       => 'dba',
        'use_relevant'  => 15,
        'min_dev'       => 0.2,
        'rob_s'         => 0.3,
        'rob_x'         => 0.5
    );

    private $_lexer      = NULL;
    private $_database   = NULL;
    private $_token_data = NULL;

    const SPAM    = 'spam';
    const HAM     = 'ham';
    const LEARN   = 'learn';
    const UNLEARN = 'unlearn';

    const STARTUP_FAIL_DATABASE = 'STARTUP_FAIL_DATABASE';
    const STARTUP_FAIL_LEXER    = 'STARTUP_FAIL_LEXER';
    const TRAINER_CATEGORY_FAIL = 'TRAINER_CATEGORY_FAIL';

    /**
     * Constructs b8
     *
     * Merges $config into the defaults, then loads and instantiates the
     * storage backend and the lexer. If a class file cannot be loaded the
     * constructor stops early; validate() reports the failure afterwards.
     *
     * @access public
     * @param array $config Overrides for the entries of $this->config
     * @param array $database_config Passed as-is to the storage backend
     * @return void
     */

    function __construct($config = array(), $database_config = array())
    {

        # Validate config data

        if(is_array($config)) {

            foreach($config as $name => $value) {

                switch($name) {

                    case 'min_dev':
                    case 'rob_s':
                    case 'rob_x':
                        $this->config[$name] = (float) $value;
                        break;

                    case 'min_size':
                    case 'max_size':
                    case 'use_relevant':
                        $this->config[$name] = (int) $value;
                        break;

                    case 'allow_numbers':
                        $this->config[$name] = (bool) $value;
                        break;

                    case 'lexer':
                        # Fall back to the default lexer if the requested one does not exist
                        $value = (string) strtolower($value);
                        $this->config[$name] = is_file(dirname(__FILE__) . DIRECTORY_SEPARATOR . 'lexer' . DIRECTORY_SEPARATOR . "lexer_" . $value . '.php') === TRUE ? $value : 'default';
                        break;

                    case 'storage':
                        $this->config[$name] = (string) $value;
                        break;

                }

            }

        }

        $base = dirname(__FILE__) . DIRECTORY_SEPARATOR;

        # Setup the database backend

        # Get the basic storage class used by all backends
        if($this->load_class('b8_storage_base', $base . 'storage' . DIRECTORY_SEPARATOR . 'storage_base.php') === FALSE)
            return;

        # Get the degenerator we need
        if($this->load_class('b8_degenerator_' . $this->config['degenerator'], $base . 'degenerator' . DIRECTORY_SEPARATOR . 'degenerator_' . $this->config['degenerator'] . '.php') === FALSE)
            return;

        # Get the actual storage backend we need
        if($this->load_class('b8_storage_' . $this->config['storage'], $base . 'storage' . DIRECTORY_SEPARATOR . 'storage_' . $this->config['storage'] . '.php') === FALSE)
            return;

        # Setup the backend
        $class = 'b8_storage_' . $this->config['storage'];
        $this->_database = new $class(
            $database_config,
            $this->config['degenerator'],
            date('ymd')
        );

        # Setup the lexer class

        if($this->load_class('b8_lexer_' . $this->config['lexer'], $base . 'lexer' . DIRECTORY_SEPARATOR . 'lexer_' . $this->config['lexer'] . '.php') === FALSE)
            return;

        $class = 'b8_lexer_' . $this->config['lexer'];
        $this->_lexer = new $class(
            array(
                'min_size'      => $this->config['min_size'],
                'max_size'      => $this->config['max_size'],
                'allow_numbers' => $this->config['allow_numbers']
            )
        );

    }

    /**
     * Load a class file if a class has not been defined yet.
     *
     * @access public
     * @param string $class_name Name of the class expected in $class_file
     * @param string $class_file Path of the file to include
     * @return boolean Returns TRUE if everything is okay, otherwise FALSE.
     */

    public function load_class($class_name, $class_file)
    {

        if(class_exists($class_name, FALSE) === TRUE)
            return TRUE;

        # require_once stops the script with a fatal error on a missing file,
        # so check first to be able to report the problem via the return value
        if(is_file($class_file) === FALSE)
            return FALSE;

        $included = require_once $class_file;

        if($included === FALSE or class_exists($class_name, FALSE) === FALSE)
            return FALSE;

        return TRUE;

    }

    /**
     * Validates the class has all it needs to work.
     *
     * @access public
     * @return mixed Returns TRUE if everything is okay, otherwise an error code.
     */

    public function validate()
    {

        if($this->_database === NULL)
            return self::STARTUP_FAIL_DATABASE;

        # Connect the database backend if we aren't connected yet

        elseif($this->_database->connected === FALSE) {

            $connection = $this->_database->connect();

            if($connection !== TRUE)
                return $connection;

        }

        if($this->_lexer === NULL)
            return self::STARTUP_FAIL_LEXER;

        return TRUE;

    }

    /**
     * Classifies a text
     *
     * @access public
     * @param string $text
     * @return mixed The rating between 0 (ham) and 1 (spam) as a float, or an
     *               error code (string) from startup, the lexer or the storage
     */

    public function classify($text)
    {

        # Validate the startup

        $started_up = $this->validate();

        if($started_up !== TRUE)
            return $started_up;

        # Get the internal database variables, containing the number of ham and
        # spam texts so the spam probability can be calculated in relation to them
        $internals = $this->_database->get_internals();

        # Get all tokens we want to rate

        $tokens = $this->_lexer->get_tokens($text);

        # Check if the lexer failed
        # (if so, $tokens will be a lexer error code, if not, $tokens will be an array)
        if(!is_array($tokens))
            return $tokens;

        # Fetch all available data for the token set from the database
        $this->_token_data = $this->_database->get(array_keys($tokens));

        # The storage backend hands back an error code instead of an array
        # if it is not usable; pass it on instead of rating against nothing
        if(!is_array($this->_token_data))
            return $this->_token_data;

        # Calculate the spamminess and importance for each token (or a degenerated form of it)

        $word_count = array();
        $rating     = array();
        $importance = array();

        foreach($tokens as $word => $count) {

            $word_count[$word] = $count;

            $rating[$word] = $this->_get_probability($word, $internals['texts_ham'], $internals['texts_spam']);

            $importance[$word] = abs(0.5 - $rating[$word]);

        }

        # Order by importance
        arsort($importance);

        # Get the most interesting tokens (use all if we have less than the given number).
        # Only the first "use_relevant" tokens are looked at, whether or not
        # they pass the "min_dev" check.

        $relevant  = array();
        $remaining = $this->config['use_relevant'];

        foreach($importance as $word => $deviation) {

            if($remaining-- <= 0)
                break;

            # If the token's rating is relevant enough, use it

            if($deviation > $this->config['min_dev']) {

                # Tokens that appear more than once also count more than once

                for($x = 0, $l = $word_count[$word]; $x < $l; $x++)
                    $relevant[] = $rating[$word];

            }

        }

        # Calculate the spamminess of the text (thanks to Mr. Robinson ;-)

        $n = count($relevant);

        if($n === 0) {
            # If no token was good for calculation, we really don't know how
            # to rate this text; so we assume a spam and ham probability of 0.5
            $hamminess  = 0.5;
            $spamminess = 0.5;
            $n = 1;
        }
        else {
            # We set both hamminess and spamminess to 1 for the first multiplying
            $hamminess  = 1;
            $spamminess = 1;

            # Consider all relevant ratings
            foreach($relevant as $value) {
                $hamminess  *= (1.0 - $value);
                $spamminess *= $value;
            }
        }

        # Calculate the combined rating

        # The actual hamminess and spamminess
        $hamminess  = 1 - pow($hamminess,  (1 / $n));
        $spamminess = 1 - pow($spamminess, (1 / $n));

        # Calculate the combined indicator
        $probability = ($hamminess - $spamminess) / ($hamminess + $spamminess);

        # We want a value between 0 and 1, not between -1 and +1, so ...
        $probability = (1 + $probability) / 2;

        # Alea iacta est
        return $probability;

    }

    /**
     * Calculate the spamminess of a single token also considering "degenerated" versions
     *
     * @access private
     * @param string $word
     * @param int $texts_ham Number of learned ham texts
     * @param int $texts_spam Number of learned spam texts
     * @return float The rating of the token
     */

    private function _get_probability($word, $texts_ham, $texts_spam)
    {

        if(isset($this->_token_data['tokens'][$word]) === TRUE) {
            # The token was in the database, so we can use its data as-is
            # and calculate the spamminess of this token directly
            return $this->_calc_probability($this->_token_data['tokens'][$word], $texts_ham, $texts_spam);
        }

        # The token was not found, so do we have at least similar words?

        if(isset($this->_token_data['degenerates'][$word]) === TRUE) {

            # We found similar words, so calculate the spamminess for each one
            # and choose the most important one for the further calculation

            # The default rating is 0.5 simply saying nothing
            $rating = 0.5;

            foreach($this->_token_data['degenerates'][$word] as $degenerate => $count) {

                # Calculate the rating of the current degenerated token
                $rating_tmp = $this->_calc_probability($count, $texts_ham, $texts_spam);

                # Is it more important than the rating of another degenerated version?
                if(abs(0.5 - $rating_tmp) > abs(0.5 - $rating))
                    $rating = $rating_tmp;

            }

            return $rating;

        }

        # The token is really unknown, so choose the default rating
        # for completely unknown tokens. This strips down to the
        # robX parameter so we can cheap out the freaky math ;-)
        return $this->config['rob_x'];

    }

    /**
     * Do the actual spamminess calculation of a single token
     *
     * @access private
     * @param array $data array('count_ham' => int, 'count_spam' => int)
     * @param int $texts_ham Number of learned ham texts
     * @param int $texts_spam Number of learned spam texts
     * @return float The rating of the token
     */

    private function _calc_probability($data, $texts_ham, $texts_spam)
    {

        # Calculate the basic probability by Mr. Graham

        # But: consider the number of ham and spam texts saved instead of the
        # number of entries where the token appeared to calculate a relative
        # spamminess because we count tokens appearing multiple times not just
        # once but as often as they appear in the learned texts

        $rel_ham  = $data['count_ham'];
        $rel_spam = $data['count_spam'];

        if($texts_ham > 0)
            $rel_ham = $data['count_ham'] / $texts_ham;

        if($texts_spam > 0)
            $rel_spam = $data['count_spam'] / $texts_spam;

        # A token stored with no occurrences at all tells us nothing. Treat it
        # like an unknown token instead of dividing by zero.
        if(($rel_ham + $rel_spam) <= 0)
            return $this->config['rob_x'];

        $rating = $rel_spam / ($rel_ham + $rel_spam);

        # Calculate the better probability proposed by Mr. Robinson
        $all = $data['count_ham'] + $data['count_spam'];
        return (($this->config['rob_s'] * $this->config['rob_x']) + ($all * $rating)) / ($this->config['rob_s'] + $all);

    }

    /**
     * Check the validity of the category of a request
     *
     * @access private
     * @param string $category
     * @return boolean TRUE if $category is b8::HAM or b8::SPAM
     */

    private function _check_category($category)
    {
        return $category === self::HAM or $category === self::SPAM;
    }

    /**
     * Learn a reference text
     *
     * @access public
     * @param string $text
     * @param const $category Either b8::SPAM or b8::HAM
     * @return mixed An error code on failure, otherwise whatever the storage backend returns
     */

    public function learn($text, $category)
    {
        return $this->_process_text($text, $category, self::LEARN);
    }

    /**
     * Unlearn a reference text
     *
     * @access public
     * @param string $text
     * @param const $category Either b8::SPAM or b8::HAM
     * @return mixed An error code on failure, otherwise whatever the storage backend returns
     */

    public function unlearn($text, $category)
    {
        return $this->_process_text($text, $category, self::UNLEARN);
    }

    /**
     * Does the actual interaction with the storage backend for learning or unlearning texts
     *
     * @access private
     * @param string $text
     * @param const $category Either b8::SPAM or b8::HAM
     * @param const $action Either b8::LEARN or b8::UNLEARN
     * @return mixed An error code on failure, otherwise whatever the storage backend returns
     */

    private function _process_text($text, $category, $action)
    {

        # Validate the startup

        $started_up = $this->validate();

        if($started_up !== TRUE)
            return $started_up;

        # Look if the request is okay
        if($this->_check_category($category) === FALSE)
            return self::TRAINER_CATEGORY_FAIL;

        # Get all tokens from $text

        $tokens = $this->_lexer->get_tokens($text);

        # Check if the lexer failed
        # (if so, $tokens will be a lexer error code, if not, $tokens will be an array)
        if(!is_array($tokens))
            return $tokens;

        # Pass the tokens and what to do with it to the storage backend
        return $this->_database->process_text($tokens, $category, $action);

    }

}
/**
 * Default degenerator of the b8 package.
 *
 * Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
 *
 * This file is part of the b8 package. It is free software; you can
 * redistribute it and/or modify it under the terms of the GNU Lesser General
 * Public License as published by the Free Software Foundation in version 2.1
 * of the License. It is distributed WITHOUT ANY WARRANTY.
 *
 * Builds "degenerated" spellings of a word (different case, stripped trailing
 * "!", "?" and ".") that are looked up when the word itself is unknown.
 *
 * @license LGPL
 * @access public
 * @package b8
 * @author Tobias Leupold
 */

class b8_degenerator_default
{

    # Cache: word => list of its degenerated versions
    public $degenerates = array();

    /**
     * Generates a list of "degenerated" words for a list of words.
     *
     * @access public
     * @param array $words
     * @return array An array containing an array of degenerated tokens for each token
     */

    public function degenerate(array $words)
    {

        $degenerates = array();

        foreach($words as $word)
            $degenerates[$word] = $this->_degenerate_word($word);

        return $degenerates;

    }

    /**
     * If the original word is not found in the database then
     * we build "degenerated" versions of the word to lookup.
     *
     * @access private
     * @param string $word
     * @return array A list of degenerated words, without duplicates and
     *               without the word itself
     */

    protected function _degenerate_word($word)
    {

        # Check for any stored words so the process doesn't have to repeat
        if(isset($this->degenerates[$word]) === TRUE)
            return $this->degenerates[$word];

        # Numeric tokens arrive as integers when they were used as array keys
        $original = (string) $word;

        $degenerate = array();

        # Add different version of upper and lower case and ucfirst
        $degenerate[] = strtolower($original);
        $degenerate[] = strtoupper($original);
        $degenerate[] = ucfirst($original);

        # Degenerate all versions (foreach works on a copy, so the versions
        # appended below are not degenerated again)

        foreach($degenerate as $alt_word) {

            # Look for stuff like !!! and ???

            if(preg_match('/[!?]$/', $alt_word) > 0) {

                # Add versions with different !s and ?s

                if(preg_match('/[!?]{2,}$/', $alt_word) > 0)
                    $degenerate[] = preg_replace('/([!?])+$/', '$1', $alt_word);

                $degenerate[] = preg_replace('/([!?])+$/', '', $alt_word);

            }

            # Look for ... at the end of the word

            $alt_word_int = $alt_word;

            while(preg_match('/[\.]$/', $alt_word_int) > 0) {
                $alt_word_int = substr($alt_word_int, 0, strlen($alt_word_int) - 1);
                $degenerate[] = $alt_word_int;
            }

        }

        # Some degenerates are the same as the original word. These don't have
        # to be fetched, so we create a new array with only new tokens.
        # The comparison has to be strict: a loose one compares numeric-looking
        # strings by value ("1e1" == "1E1") and would drop real variants.

        $real_degenerate = array();

        foreach($degenerate as $deg_word) {
            if($original !== $deg_word)
                $real_degenerate[] = $deg_word;
        }

        # Each version only has to be looked up once
        $real_degenerate = array_values(array_unique($real_degenerate));

        # Store the list of degenerates for the token
        $this->degenerates[$word] = $real_degenerate;

        return $real_degenerate;

    }

}
/**
 * Default lexer of the b8 package.
 *
 * Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
 *
 * This file is part of the b8 package. It is free software; you can
 * redistribute it and/or modify it under the terms of the GNU Lesser General
 * Public License as published by the Free Software Foundation in version 2.1
 * of the License. It is distributed WITHOUT ANY WARRANTY.
 *
 * Splits a text into countable tokens: dotted words (URLs, IP addresses) and
 * their parts, the remaining words, and HTML tags.
 *
 * @license LGPL
 * @access public
 * @package b8
 * @author Tobias Leupold
 * @author Oliver Lillie (aka buggedcom) (original PHP 5 port)
 */

class b8_lexer_default
{

    const LEXER_TEXT_NOT_STRING = 'LEXER_TEXT_NOT_STRING';
    const LEXER_TEXT_EMPTY      = 'LEXER_TEXT_EMPTY';

    # Expects the keys min_size, max_size and allow_numbers
    public $config = NULL;

    # The regular expressions we use to split the text to tokens

    public $regexp = array(
        'ip'        => '/([A-Za-z0-9\_\-\.]+)/',
        'raw_split' => '/[\s,\.\/"\:;\|<>\-_\[\]{}\+=\)\(\*\&\^%]+/',
        'html'      => '/(<.+?>)/',
        'tagname'   => '/(.+?)\s/',
        'numbers'   => '/^[0-9]+$/'
    );

    /**
     * Constructs the lexer.
     *
     * @access public
     * @param array $config
     * @return void
     */

    function __construct($config)
    {
        $this->config = $config;
    }

    /**
     * Generates the tokens required for the bayesian filter.
     *
     * @access public
     * @param string $text
     * @return mixed An array(token => number of occurrences), or one of the
     *               LEXER_* error codes if $text is not a non-empty string
     */

    public function get_tokens($text)
    {

        # Only non-empty strings can be tokenized

        if(!is_string($text))
            return self::LEXER_TEXT_NOT_STRING;

        if(empty($text))
            return self::LEXER_TEXT_EMPTY;

        # The text arrives with html entities; get the real UTF-8 characters back
        $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');

        $found = array();

        # Step 1: everything that contains a dot (URLs, IP addresses)

        preg_match_all($this->regexp['ip'], $text, $matches);

        foreach($matches[1] as $candidate) {

            if(strpos($candidate, '.') === FALSE or $this->_is_valid($candidate) === FALSE)
                continue;

            $this->_count_token($found, $candidate);

            # Remove it from the text, so that step 2 does not see it again
            $text = str_replace($candidate, '', $text);

            # The single parts of the URL are tokens as well
            foreach(preg_split($this->regexp['raw_split'], $candidate) as $part) {
                if($this->_is_valid($part))
                    $this->_count_token($found, $part);
            }

        }

        # Step 2: the words of what is left of the text

        foreach(preg_split($this->regexp['raw_split'], $text) as $piece) {
            if($this->_is_valid($piece))
                $this->_count_token($found, $piece);
        }

        # Step 3: the HTML tags

        preg_match_all($this->regexp['html'], $text, $tags);

        foreach($tags[1] as $tag) {

            if(!$this->_is_valid($tag))
                continue;

            # A tag with parameters is reduced to its name

            if(strpos($tag, ' ') !== FALSE) {
                preg_match($this->regexp['tagname'], $tag, $name);
                $tag = "{$name[1]}...>";
            }

            $this->_count_token($found, $tag);

        }

        return $found;

    }

    /**
     * Counts one more occurrence of a token.
     *
     * @access private
     * @param array $found The token list to update, array(token => count)
     * @param string $token
     * @return void
     */

    private function _count_token(array &$found, $token)
    {
        $found[$token] = isset($found[$token]) ? $found[$token] + 1 : 1;
    }

    /**
     * Validates a token.
     *
     * @access private
     * @param string $token The token string.
     * @return boolean Returns TRUE if the token is valid, otherwise returns FALSE
     */

    private function _is_valid($token)
    {

        # Too short or too long?

        $length = strlen($token);

        if($length < $this->config['min_size'])
            return FALSE;

        if($length > $this->config['max_size'])
            return FALSE;

        # Pure numbers are only tokens if the configuration says so

        $reject_numbers = ($this->config['allow_numbers'] === FALSE);

        if($reject_numbers && preg_match($this->regexp['numbers'], $token) > 0)
            return FALSE;

        return TRUE;

    }

}
/**
 * Functions used by all storage backends
 *
 * Copyright (C) 2010 Tobias Leupold <tobias.leupold@web.de>
 *
 * This file is part of the b8 package. It is free software; you can
 * redistribute it and/or modify it under the terms of the GNU Lesser General
 * Public License as published by the Free Software Foundation in version 2.1
 * of the License. It is distributed WITHOUT ANY WARRANTY.
 *
 * A backend extending this class is expected to provide the property
 * $b8_config (keys "degenerator" and "today") and the methods _get_query(),
 * _put(), _update(), _del() and _commit(), all of which are called here.
 * Token data is stored as the string "count_ham count_spam lastseen".
 *
 * @license LGPL
 * @access public
 * @package b8
 * @author Tobias Leupold
 */

abstract class b8_storage_base
{

    public $connected = FALSE;

    protected $_degenerator = NULL;

    const INTERNALS_TEXTS_HAM  = 'bayes*texts.ham';
    const INTERNALS_TEXTS_SPAM = 'bayes*texts.spam';
    const INTERNALS_DBVERSION  = 'bayes*dbversion';

    const BACKEND_NOT_CONNECTED  = 'BACKEND_NOT_CONNECTED';
    const DATABASE_WRONG_VERSION = 'DATABASE_WRONG_VERSION';
    const DATABASE_NOT_B8        = 'DATABASE_NOT_B8';

    /**
     * Validates the class has all it needs to work.
     *
     * @access protected
     * @return mixed Returns TRUE if everything is okay, otherwise an error code.
     */

    protected function validate()
    {

        # We set up the degenerator here, as we would have to duplicate code if it
        # was done in the constructor of the respective storage backend. It is
        # only created once, so its cache of degenerated words is kept.
        if($this->_degenerator === NULL) {
            $class = 'b8_degenerator_' . $this->b8_config['degenerator'];
            $this->_degenerator = new $class();
        }

        if($this->connected !== TRUE)
            return self::BACKEND_NOT_CONNECTED;

        return TRUE;

    }

    /**
     * Checks if a b8 database is used and if its version is okay
     *
     * @access protected
     * @return mixed Returns TRUE if everything is okay, otherwise an error code.
     */

    protected function check_database()
    {

        $internals = $this->get_internals();

        # No version entry at all: this is not a b8 database
        if(!isset($internals['dbversion'])) {
            $this->connected = FALSE;
            return self::DATABASE_NOT_B8;
        }

        if((int) $internals['dbversion'] !== 2) {
            $this->connected = FALSE;
            return self::DATABASE_WRONG_VERSION;
        }

        return TRUE;

    }

    /**
     * Parses the "count" data of a token.
     *
     * @access private
     * @param string $data "count_ham count_spam lastseen"
     * @return array Returns an array of the parsed data: array(count_ham, count_spam).
     */

    private function _parse_count($data)
    {

        # Missing fields of a malformed entry count as 0
        $fields = array_pad(explode(' ', (string) $data), 2, 0);

        return array(
            'count_ham'  => (int) $fields[0],
            'count_spam' => (int) $fields[1]
        );

    }

    /**
     * Get the database's internal variables.
     *
     * @access public
     * @return array Returns array('texts_ham' => int, 'texts_spam' => int) plus
     *               'dbversion' => int if the database has a version entry.
     */

    public function get_internals()
    {

        $stored = $this->_get_query(
            array(
                self::INTERNALS_TEXTS_HAM,
                self::INTERNALS_TEXTS_SPAM,
                self::INTERNALS_DBVERSION
            )
        );

        $internals = array(
            'texts_ham'  => isset($stored[self::INTERNALS_TEXTS_HAM])  ? (int) $stored[self::INTERNALS_TEXTS_HAM]  : 0,
            'texts_spam' => isset($stored[self::INTERNALS_TEXTS_SPAM]) ? (int) $stored[self::INTERNALS_TEXTS_SPAM] : 0
        );

        # Only report a version if there is one; check_database() tells a
        # foreign database from an outdated b8 database by this key
        if(isset($stored[self::INTERNALS_DBVERSION]))
            $internals['dbversion'] = (int) $stored[self::INTERNALS_DBVERSION];

        return $internals;

    }

    /**
     * Get all data about a list of tags from the database.
     *
     * @access public
     * @param array $tokens
     * @return mixed Returns an error code on failure, otherwise returns array of returned data in the format array('tokens' => array(token => count), 'degenerates' => array(token => array(degenerate => count))).
     */

    public function get($tokens)
    {

        # Validate the startup

        $started_up = $this->validate();

        if($started_up !== TRUE)
            return $started_up;

        # First we see what we have in the database.
        $token_data = $this->_get_query($tokens);

        # Check if we have to degenerate some tokens

        $missing_tokens = array();

        foreach($tokens as $token) {
            if(!isset($token_data[$token]))
                $missing_tokens[] = $token;
        }

        $degenerates = array();

        if(count($missing_tokens) > 0) {

            # Generate a list of degenerated tokens for the missing tokens ...
            $degenerates = $this->_degenerator->degenerate($missing_tokens);

            $degenerates_list = array();

            foreach($degenerates as $token_degenerates) {
                foreach($token_degenerates as $degenerate)
                    $degenerates_list[] = $degenerate;
            }

            # ... and look them up (each one only once). The union operator is
            # used as array_merge() would renumber tokens consisting of digits.
            $degenerates_list = array_values(array_unique($degenerates_list));
            $token_data = $token_data + $this->_get_query($degenerates_list);

        }

        # Here, we have all available data in $token_data.

        $return_data_tokens      = array();
        $return_data_degenerates = array();

        foreach($tokens as $token) {

            if(isset($token_data[$token]) === TRUE) {

                # The token was found in the database

                # Add the data ...
                $counts = $this->_parse_count($token_data[$token]);
                $return_data_tokens[$token] = $counts;

                # ... and update its lastseen parameter
                $this->_update($token, "{$counts['count_ham']} {$counts['count_spam']} " . $this->b8_config['today']);

                continue;

            }

            # The token was not found, so we look if we
            # can return data for degenerated tokens

            $candidates = isset($degenerates[$token]) ? $degenerates[$token] : array();

            foreach($candidates as $degenerate) {

                if(isset($token_data[$degenerate]) === FALSE)
                    continue;

                # A degeneration of the token was found in the database

                # Add the data ...
                $counts = $this->_parse_count($token_data[$degenerate]);
                $return_data_degenerates[$token][$degenerate] = $counts;

                # ... and update its lastseen parameter
                $this->_update($degenerate, "{$counts['count_ham']} {$counts['count_spam']} " . $this->b8_config['today']);

            }

        }

        # Now, all token data directly found in the database is in $return_data_tokens
        # and all data for degenerated versions is in $return_data_degenerates

        # First, we commit the changes to the lastseen parameters
        $this->_commit();

        # Then, we return what we have
        return array(
            'tokens'      => $return_data_tokens,
            'degenerates' => $return_data_degenerates
        );

    }

    /**
     * Stores or deletes a list of tokens from the given category.
     *
     * @access public
     * @param array $tokens array(token => number of occurrences in the text)
     * @param const $category Either b8::HAM or b8::SPAM
     * @param const $action Either b8::LEARN or b8::UNLEARN
     * @return mixed An error code if the backend is not usable, otherwise nothing
     */

    public function process_text($tokens, $category, $action)
    {

        # Validate the startup

        $started_up = $this->validate();

        if($started_up !== TRUE)
            return $started_up;

        # No matter what we do, we first have to check what data we have.

        # First get the internals, including the ham texts and spam texts counter
        $internals = $this->get_internals();

        # Then, fetch all data for all tokens we have
        $token_data = $this->_get_query(array_keys($tokens));

        # Process all tokens to learn/unlearn

        foreach($tokens as $token => $count) {

            $count = (int) $count;

            if(isset($token_data[$token])) {

                # We already have this token, so update its data

                $counts     = $this->_parse_count($token_data[$token]);
                $count_ham  = $counts['count_ham'];
                $count_spam = $counts['count_spam'];

                # Increase or decrease the right counter

                if($action === b8::LEARN) {
                    if($category === b8::HAM)
                        $count_ham += $count;
                    elseif($category === b8::SPAM)
                        $count_spam += $count;
                }

                elseif($action === b8::UNLEARN) {
                    if($category === b8::HAM)
                        $count_ham -= $count;
                    elseif($category === b8::SPAM)
                        $count_spam -= $count;
                }

                # We don't want to have negative values
                $count_ham  = max(0, $count_ham);
                $count_spam = max(0, $count_spam);

                # Now let's see if we have to update or delete the token
                if($count_ham !== 0 or $count_spam !== 0)
                    $this->_update($token, "$count_ham $count_spam " . $this->b8_config['today']);
                else
                    $this->_del($token);

            }

            elseif($action === b8::LEARN) {

                # We don't have the token. If we unlearn a text, we can't delete it
                # as we don't have it anyway, so just do something if we learn a text.
                # The token is stored as often as it appears in the text, just
                # like an already known token is increased by $count above.

                if($category === b8::HAM)
                    $this->_put($token, "$count 0 " . $this->b8_config['today']);
                elseif($category === b8::SPAM)
                    $this->_put($token, "0 $count " . $this->b8_config['today']);

            }

        }

        # Now, all token have been processed, so let's update the right text counter

        if($category === b8::HAM) {
            $counter = 'texts_ham';
            $key     = self::INTERNALS_TEXTS_HAM;
        }
        elseif($category === b8::SPAM) {
            $counter = 'texts_spam';
            $key     = self::INTERNALS_TEXTS_SPAM;
        }
        else {
            $counter = NULL;
            $key     = NULL;
        }

        if($counter !== NULL) {

            if($action === b8::LEARN) {
                $internals[$counter]++;
                $this->_update($key, $internals[$counter]);
            }

            elseif($action === b8::UNLEARN) {
                # The counter must not become negative
                $internals[$counter] = max(0, $internals[$counter] - 1);
                $this->_update($key, $internals[$counter]);
            }

        }

        # We're done and can commit all changes to the database now
        $this->_commit();

    }

}
<?php

# Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
#
# This file is part of the b8 package
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation in version 2.1 of the License.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
# License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

/**
 * The DBA (Berkeley DB) abstraction layer for communicating with the database.
 * Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
 *
 * Every token is stored as one key/value pair in a DBA file. Writes are
 * applied immediately, so _commit() has nothing left to do.
 *
 * NOTE(review): $this->connected and check_database() are not declared in
 * this class; presumably both come from b8_storage_base - confirm there.
 *
 * @license LGPL
 * @access public
 * @package b8
 * @author Tobias Leupold
 */

class b8_storage_dba extends b8_storage_base
{

    /**
     * Backend settings: the database file name (resolved relative to the
     * parent of this file's directory, see connect()) and the DBA handler.
     */
    public $config = array(
        'database' => 'wordlist.db',
        'handler'  => 'db4',
    );

    /**
     * Values handed over from the main b8 configuration by the constructor.
     */
    public $b8_config = array(
        'degenerator' => NULL,
        'today'       => NULL
    );

    /**
     * The open DBA handle, or NULL while no database is open.
     */
    private $_db = NULL;

    const DATABASE_CONNECTION_FAIL = 'DATABASE_CONNECTION_FAIL';

    /**
     * Constructs the database layer.
     *
     * @access public
     * @param array $config       Settings overriding the defaults in $this->config; each value is cast to string
     * @param mixed $degenerator  Stored as $this->b8_config['degenerator']
     * @param mixed $today        Stored as $this->b8_config['today']
     */

    function __construct($config, $degenerator, $today)
    {

        # Keep the parts of the main b8 config this class needs
        $this->b8_config['degenerator'] = $degenerator;
        $this->b8_config['today']       = $today;

        # No overrides were passed, so the defaults stay in place
        if(count($config) < 1)
            return;

        # Take over each passed setting, normalized to a string
        foreach($config as $key => $setting)
            $this->config[$key] = (string) $setting;

    }

    /**
     * Closes the database connection, if one was opened.
     *
     * @access public
     * @return void
     */

    function __destruct()
    {

        # Nothing was ever opened, so there is nothing to close
        if($this->_db === NULL)
            return;

        dba_close($this->_db);
        $this->connected = FALSE;

    }

    /**
     * Connect to the database and do some checks.
     *
     * @access public
     * @return mixed TRUE if the database is already open; DATABASE_CONNECTION_FAIL if it could not be opened; otherwise whatever check_database() returns.
     */

    public function connect()
    {

        # A handle already exists, so there is nothing to do
        if($this->_db !== NULL)
            return TRUE;

        # The database file lives one directory above this file
        $path = implode(
            DIRECTORY_SEPARATOR,
            array(dirname(__FILE__), "..", $this->config['database'])
        );

        # Mode "w": read/write access to a database that already exists
        $handle = dba_open($path, "w", $this->config['handler']);

        if($handle === FALSE) {
            $this->connected = FALSE;
            $this->_db = NULL;
            return self::DATABASE_CONNECTION_FAIL;
        }

        # Everything is okay and connected
        $this->_db = $handle;
        $this->connected = TRUE;

        # Let's see if this is a b8 database and the version is okay
        return $this->check_database();

    }

    /**
     * Does the actual interaction with the database when fetching data.
     *
     * @access protected
     * @param array $tokens The keys to look up
     * @return array The stored values in the format array(token => data); tokens without an entry are left out, so the array may be empty.
     */

    protected function _get_query($tokens)
    {

        $found = array();

        foreach($tokens as $key) {

            $value = dba_fetch($key, $this->_db);

            # dba_fetch() reports a failed lookup with FALSE; skip those keys
            if($value === FALSE)
                continue;

            $found[$key] = $value;

        }

        return $found;

    }

    /**
     * Store a token to the database.
     *
     * @access protected
     * @param string $token
     * @param string $count
     * @return bool The result of dba_insert(): TRUE on success or FALSE on failure
     */

    protected function _put($token, $count)
    {
        $stored = dba_insert($token, $count, $this->_db);
        return $stored;
    }

    /**
     * Update an existing token.
     *
     * @access protected
     * @param string $token
     * @param string $count
     * @return bool The result of dba_replace(): TRUE on success or FALSE on failure
     */

    protected function _update($token, $count)
    {
        $replaced = dba_replace($token, $count, $this->_db);
        return $replaced;
    }

    /**
     * Remove a token from the database.
     *
     * @access protected
     * @param string $token
     * @return bool The result of dba_delete(): TRUE on success or FALSE on failure
     */

    protected function _del($token)
    {
        $removed = dba_delete($token, $this->_db);
        return $removed;
    }

    /**
     * Does nothing: every write above has already gone to the database.
     *
     * @access protected
     * @return void
     */

    protected function _commit()
    {
        # This method only exists because the (My)SQL backend(s) need it.
        return;
    }

}

?>
<?php

# Copyright (C) 2006-2011 Tobias Leupold <tobias.leupold@web.de>
#
# This file is part of the b8 package
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation in version 2.1 of the License.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
# License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

/**
 * The MySQL abstraction layer for communicating with the database.
 * Copyright (C) 2009 Oliver Lillie (aka buggedcom)
 * Copyright (C) 2010-2011 Tobias Leupold <tobias.leupold@web.de>
 *
 * Reads are executed immediately; inserts, updates and deletes are queued in
 * memory and sent as batched statements by _commit().
 *
 * NOTE(review): this class is built on the original mysql_* API (ext/mysql),
 * which was deprecated in PHP 5.5 and removed in PHP 7.0. Moving to mysqli or
 * PDO would change what the 'connection' setting accepts, so it is flagged
 * here rather than done silently.
 *
 * NOTE(review): $this->connected and check_database() are not declared in
 * this class; presumably both come from b8_storage_base - confirm there.
 *
 * @license LGPL
 * @access public
 * @package b8
 * @author Oliver Lillie (aka buggedcom) (original PHP 5 port and optimizations)
 * @author Tobias Leupold
 */

class b8_storage_mysql extends b8_storage_base
{

    /**
     * Backend settings. 'connection' is NULL when b8 should open its own
     * link, an existing mysql link resource when one was passed in, or FALSE
     * when something was passed that is not a usable mysql link.
     */
    public $config = array(
        'database'   => 'b8_wordlist',
        'table_name' => 'b8_wordlist',
        'host'       => 'localhost',
        'user'       => FALSE,
        'pass'       => FALSE,
        'connection' => NULL
    );

    /**
     * Values handed over from the main b8 configuration by the constructor.
     */
    public $b8_config = array(
        'degenerator' => NULL,
        'today'       => NULL
    );

    # The link in use, plus the write operations queued until _commit()
    private $_connection = NULL;
    private $_deletes    = array();
    private $_puts       = array();
    private $_updates    = array();

    const DATABASE_CONNECTION_FAIL         = 'DATABASE_CONNECTION_FAIL';
    const DATABASE_CONNECTION_ERROR        = 'DATABASE_CONNECTION_ERROR';
    const DATABASE_CONNECTION_BAD_RESOURCE = 'DATABASE_CONNECTION_BAD_RESOURCE';
    const DATABASE_SELECT_ERROR            = 'DATABASE_SELECT_ERROR';
    const DATABASE_TABLE_ACCESS_FAIL       = 'DATABASE_TABLE_ACCESS_FAIL';
    const DATABASE_WRONG_VERSION           = 'DATABASE_WRONG_VERSION';

    /**
     * Constructs the database layer.
     *
     * Only the known settings are taken over; unknown keys in $config are
     * ignored.
     *
     * @access public
     * @param array $config       Settings overriding the defaults in $this->config
     * @param mixed $degenerator  Stored as $this->b8_config['degenerator']
     * @param mixed $today        Stored as $this->b8_config['today']
     */

    function __construct($config, $degenerator, $today)
    {

        # Pass some variables of the main b8 config to this class
        $this->b8_config['degenerator'] = $degenerator;
        $this->b8_config['today']       = $today;

        # Validate the config items

        if(count($config) > 0) {

            foreach($config as $name => $value) {

                switch($name) {

                    case 'table_name':
                    case 'host':
                    case 'user':
                    case 'pass':
                    case 'database':
                        $this->config[$name] = (string) $value;
                        break;

                    case 'connection':

                        # NULL means "open our own link" and keeps the default
                        if($value !== NULL) {

                            if(is_resource($value) === TRUE) {
                                # Only (persistent) mysql links are usable;
                                # anything else is remembered as FALSE so that
                                # connect() can report the bad resource
                                $resource_type = get_resource_type($value);
                                $this->config['connection'] = $resource_type !== 'mysql link' && $resource_type !== 'mysql link persistent' ? FALSE : $value;
                            }

                            else
                                $this->config['connection'] = FALSE;

                        }

                        break;

                }

            }

        }

    }

    /**
     * Flushes pending changes and closes the database connection.
     *
     * @access public
     * @return void
     */

    function __destruct()
    {

        if($this->_connection === NULL)
            return;

        # Commit any changes before closing
        $this->_commit();

        # Just close the connection if no link-resource was passed and b8 created its own connection
        if($this->config['connection'] === NULL)
            mysql_close($this->_connection);

        $this->connected = FALSE;

    }

    /**
     * Connect to the database and do some checks.
     *
     * @access public
     * @return mixed TRUE if already connected; one of the DATABASE_* constants of this class (for select and table errors followed by ": " and the MySQL error text) on failure; otherwise whatever check_database() returns.
     */

    public function connect()
    {

        # Are we already connected?
        if($this->connected === TRUE)
            return TRUE;

        # Are we using an existing passed resource?
        if($this->config['connection'] === FALSE) {
            # ... yes we are, but the connection is not a resource, so return an error
            $this->connected = FALSE;
            return self::DATABASE_CONNECTION_BAD_RESOURCE;
        }

        elseif($this->config['connection'] === NULL) {

            # ... no we aren't so we have to connect.

            if($this->_connection = mysql_connect($this->config['host'], $this->config['user'], $this->config['pass'])) {
                if(mysql_select_db($this->config['database'], $this->_connection) === FALSE) {
                    $this->connected = FALSE;
                    # Ask for the error of our own link, not of whichever link was opened last
                    return self::DATABASE_SELECT_ERROR . ": " . mysql_error($this->_connection);
                }
            }
            else {
                $this->connected = FALSE;
                return self::DATABASE_CONNECTION_ERROR;
            }

        }

        else {
            # ... yes we are
            $this->_connection = $this->config['connection'];
        }

        # Just in case ...
        if($this->_connection === NULL) {
            $this->connected = FALSE;
            return self::DATABASE_CONNECTION_FAIL;
        }

        # Check to see if the wordlist table exists
        if(mysql_query('DESCRIBE ' . $this->config['table_name'], $this->_connection) === FALSE) {
            $this->connected = FALSE;
            # With a passed-in link, the "last opened" link may be a different one
            return self::DATABASE_TABLE_ACCESS_FAIL . ": " . mysql_error($this->_connection);
        }

        # Everything is okay and connected
        $this->connected = TRUE;

        # Let's see if this is a b8 database and the version is okay
        return $this->check_database();

    }

    /**
     * Does the actual interaction with the database when fetching data.
     *
     * @access protected
     * @param array|string $tokens The tokens to look up; a single token may be passed as a plain string
     * @return array The stored data in the format array(token => data); empty if no tokens were given, nothing was found or the query failed.
     */

    protected function _get_query($tokens)
    {

        $data = array();

        # Accept a single token as well as a list of tokens
        if(is_array($tokens) === FALSE)
            $tokens = ($tokens === NULL) ? array() : array($tokens);

        # Without tokens there is nothing to ask the database for. (This case
        # used to build a query from an undefined variable.)
        if(count($tokens) === 0)
            return $data;

        # Construct the query ...

        $escaped = array();

        foreach($tokens as $token)
            $escaped[] = mysql_real_escape_string($token, $this->_connection);

        $where = 'token IN ("' . implode('", "', $escaped) . '")';

        # ... and fetch the data

        $result = mysql_query('
            SELECT token, count
            FROM ' . $this->config['table_name'] . '
            WHERE ' . $where . ';
        ', $this->_connection);

        # A failed query yields FALSE, which must not be fetched from or freed
        if($result === FALSE)
            return $data;

        while($row = mysql_fetch_array($result, MYSQL_ASSOC))
            $data[$row['token']] = $row['count'];

        mysql_free_result($result);

        return $data;

    }

    /**
     * Queue a new token for insertion; it is written by _commit().
     *
     * @access protected
     * @param string $token
     * @param string $count
     * @return void
     */

    protected function _put($token, $count)
    {
        $token = mysql_real_escape_string($token, $this->_connection);
        $count = mysql_real_escape_string($count, $this->_connection);
        array_push($this->_puts, '("' . $token . '", "' . $count . '")');
    }

    /**
     * Queue an update of an existing token; it is written by _commit().
     *
     * @access protected
     * @param string $token
     * @param string $count
     * @return void
     */

    protected function _update($token, $count)
    {
        $token = mysql_real_escape_string($token, $this->_connection);
        $count = mysql_real_escape_string($count, $this->_connection);
        array_push($this->_updates, '("' . $token . '", "' . $count . '")');
    }

    /**
     * Queue the removal of a token; it is deleted by _commit().
     *
     * @access protected
     * @param string $token
     * @return void
     */

    protected function _del($token)
    {
        $token = mysql_real_escape_string($token, $this->_connection);
        array_push($this->_deletes, $token);
    }

    /**
     * Commits any modification queries.
     *
     * Sends the queued deletes, inserts and updates (in that order) as one
     * statement each and empties the queues. The queued values were already
     * escaped when they were queued. Failures are not reported, matching
     * the void contract of this method.
     *
     * mysql_query() only returns TRUE or FALSE for DELETE and INSERT
     * statements, so there is no result resource to free here.
     *
     * @access protected
     * @return void
     */

    protected function _commit()
    {

        if(count($this->_deletes) > 0) {

            mysql_query('
                DELETE FROM ' . $this->config['table_name'] . '
                WHERE token IN ("' . implode('", "', $this->_deletes) . '");
            ', $this->_connection);

            $this->_deletes = array();

        }

        if(count($this->_puts) > 0) {

            mysql_query('
                INSERT INTO ' . $this->config['table_name'] . '(token, count)
                VALUES ' . implode(', ', $this->_puts) . ';', $this->_connection);

            $this->_puts = array();

        }

        if(count($this->_updates) > 0) {

            # Updates are sent as an upsert: insert, or overwrite the count of an existing token
            mysql_query('
                INSERT INTO ' . $this->config['table_name'] . '(token, count)
                VALUES ' . implode(', ', $this->_updates) . '
                ON DUPLICATE KEY UPDATE ' . $this->config['table_name'] . '.count = VALUES(count);', $this->_connection);

            $this->_updates = array();

        }

    }

}

?>
\ No newline at end of file |