diff options
Diffstat (limited to 'library/spam/b8/b8.php')
-rw-r--r-- | library/spam/b8/b8.php | 503 |
1 files changed, 503 insertions, 0 deletions
<?php

# Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
#
# b8 - A Bayesian spam filter written in PHP 5
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation in version 2.1 of the License.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
# License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

/**
 * Copyright (C) 2006-2010 Tobias Leupold <tobias.leupold@web.de>
 *
 * @license LGPL
 * @access public
 * @package b8
 * @author Tobias Leupold
 * @author Oliver Lillie (aka buggedcom) (original PHP 5 port)
 */

class b8
{

    /**
     * Runtime configuration. Values passed to the constructor are cast to the
     * type of the matching default before they overwrite it.
     */
    public $config = array(
        'min_size'      => 3,
        'max_size'      => 30,
        'allow_numbers' => FALSE,
        'lexer'         => 'default',
        'degenerator'   => 'default',
        'storage'       => 'dba',
        'use_relevant'  => 15,
        'min_dev'       => 0.2,
        'rob_s'         => 0.3,
        'rob_x'         => 0.5
    );

    # Lexer instance; stays NULL when the lexer class could not be loaded
    private $_lexer      = NULL;

    # Storage backend instance; stays NULL when a required class could not be loaded
    private $_database   = NULL;

    # Result of the last storage lookup done by classify(); read as
    # ['tokens'][$word] and ['degenerates'][$word][$variant]
    private $_token_data = NULL;

    const SPAM    = 'spam';
    const HAM     = 'ham';
    const LEARN   = 'learn';
    const UNLEARN = 'unlearn';

    const STARTUP_FAIL_DATABASE = 'STARTUP_FAIL_DATABASE';
    const STARTUP_FAIL_LEXER    = 'STARTUP_FAIL_LEXER';
    const TRAINER_CATEGORY_FAIL = 'TRAINER_CATEGORY_FAIL';

    /**
     * Constructs b8
     *
     * Merges the given configuration into the defaults, then loads and
     * instantiates the storage backend and the lexer. If any class file cannot
     * be loaded the constructor returns early and leaves the corresponding
     * member NULL; validate() reports that state afterwards.
     *
     * @access public
     * @param array $config          Overrides for the entries of $this->config
     * @param array $database_config Passed through unchanged to the storage backend
     * @return void
     */

    public function __construct($config = array(), $database_config = array())
    {

        # Validate config data
        # (the is_array() guard keeps a non-array argument from raising an error)

        if(is_array($config)) {

            foreach($config as $name => $value) {

                switch($name) {

                    case 'min_dev':
                    case 'rob_s':
                    case 'rob_x':
                        $this->config[$name] = (float) $value;
                        break;

                    case 'min_size':
                    case 'max_size':
                    case 'use_relevant':
                        $this->config[$name] = (int) $value;
                        break;

                    case 'allow_numbers':
                        $this->config[$name] = (bool) $value;
                        break;

                    case 'lexer':
                        # Only accept a lexer whose class file actually exists,
                        # otherwise fall back to the default one
                        $value = (string) strtolower($value);
                        $lexer_file = dirname(__FILE__) . DIRECTORY_SEPARATOR . 'lexer' . DIRECTORY_SEPARATOR . "lexer_" . $value . '.php';
                        $this->config[$name] = is_file($lexer_file) === TRUE ? $value : 'default';
                        break;

                    case 'storage':
                        $this->config[$name] = (string) $value;
                        break;

                    # NOTE(review): 'degenerator' is not accepted here, so it always
                    # keeps its default value - confirm that this is intended

                }

            }

        }

        $base_dir = dirname(__FILE__) . DIRECTORY_SEPARATOR;

        # Setup the database backend

        # Get the basic storage class used by all backends
        if($this->load_class('b8_storage_base', $base_dir . 'storage' . DIRECTORY_SEPARATOR . 'storage_base.php') === FALSE)
            return;

        # Get the degenerator we need
        if($this->load_class('b8_degenerator_' . $this->config['degenerator'], $base_dir . 'degenerator' . DIRECTORY_SEPARATOR . 'degenerator_' . $this->config['degenerator'] . '.php') === FALSE)
            return;

        # Get the actual storage backend we need
        if($this->load_class('b8_storage_' . $this->config['storage'], $base_dir . 'storage' . DIRECTORY_SEPARATOR . 'storage_' . $this->config['storage'] . '.php') === FALSE)
            return;

        # Setup the backend
        $class = 'b8_storage_' . $this->config['storage'];
        $this->_database = new $class(
            $database_config,
            $this->config['degenerator'], date('ymd')
        );

        # Setup the lexer class

        if($this->load_class('b8_lexer_' . $this->config['lexer'], $base_dir . 'lexer' . DIRECTORY_SEPARATOR . 'lexer_' . $this->config['lexer'] . '.php') === FALSE)
            return;

        $class = 'b8_lexer_' . $this->config['lexer'];
        $this->_lexer = new $class(
            array(
                'min_size'      => $this->config['min_size'],
                'max_size'      => $this->config['max_size'],
                'allow_numbers' => $this->config['allow_numbers']
            )
        );

    }

    /**
     * Load a class file if a class has not been defined yet.
     *
     * @access public
     * @param string $class_name Name of the class that must exist afterwards
     * @param string $class_file File that is expected to define the class
     * @return boolean Returns TRUE if everything is okay, otherwise FALSE.
     */

    public function load_class($class_name, $class_file)
    {

        if(class_exists($class_name, FALSE) === TRUE)
            return TRUE;

        # include_once reports a missing file by returning FALSE (plus a
        # warning), whereas require_once would abort the whole script and
        # make the FALSE result promised above unreachable
        $included = include_once $class_file;

        return ($included !== FALSE and class_exists($class_name, FALSE) === TRUE);

    }

    /**
     * Validates the class has all it needs to work.
     *
     * Also connects the storage backend if it is not connected yet.
     *
     * @access public
     * @return mixed Returns TRUE if everything is okay, otherwise an error code.
     */

    public function validate()
    {

        if($this->_database === NULL)
            return self::STARTUP_FAIL_DATABASE;

        # Connect the database backend if we aren't connected yet

        if($this->_database->connected === FALSE) {

            $connection = $this->_database->connect();

            # Anything but TRUE is the backend's own error code
            if($connection !== TRUE)
                return $connection;

        }

        if($this->_lexer === NULL)
            return self::STARTUP_FAIL_LEXER;

        return TRUE;

    }

    /**
     * Classifies a text
     *
     * @access public
     * @package default
     * @param mixed $uid   Passed through to the storage backend
     * @param string $text
     * @return mixed The rating between 0 (ham) and 1 (spam), or the error code
     *               returned by validate() or by the lexer
     */

    public function classify($uid, $text)
    {

        # Validate the startup

        $started_up = $this->validate();

        if($started_up !== TRUE)
            return $started_up;

        # Get the internal database variables, containing the number of ham and
        # spam texts so the spam probability can be calculated in relation to them
        $internals = $this->_database->get_internals($uid);

        # Get all tokens we want to rate

        $tokens = $this->_lexer->get_tokens($text);

        # Check if the lexer failed
        # (if so, $tokens will be a lexer error code, if not, $tokens will be an array)
        if(!is_array($tokens))
            return $tokens;

        # Fetch all available data for the token set from the database
        $this->_token_data = $this->_database->get(array_keys($tokens), $uid);

        # Calculate the spamminess and importance for each token (or a degenerated form of it)

        $rating     = array();
        $importance = array();

        foreach($tokens as $word => $count) {
            $rating[$word]     = $this->_get_probability($word, $internals['texts_ham'], $internals['texts_spam']);
            $importance[$word] = abs(0.5 - $rating[$word]);
        }

        # Order by importance, most important first
        arsort($importance);

        # Get the most interesting tokens (use all if we have less than the given number)

        $relevant = array();
        $examined = 0;

        foreach($importance as $word => $deviation) {

            # Only the 'use_relevant' most important tokens are looked at
            if($examined >= $this->config['use_relevant'])
                break;

            $examined++;

            # If the token's rating is relevant enough, use it
            if($deviation > $this->config['min_dev']) {

                # Tokens that appear more than once also count more than once
                for($x = 0, $l = $tokens[$word]; $x < $l; $x++)
                    $relevant[] = $rating[$word];

            }

        }

        # Get the number of relevant ratings
        $n = count($relevant);

        # If no token was good for calculation, we really don't know how
        # to rate this text; so we answer with the neutral rating
        if($n === 0)
            return 0.5;

        # Calculate the spamminess of the text (thanks to Mr. Robinson ;-)
        # We set both products to 1 for the first multiplying
        $hamminess  = 1.0;
        $spamminess = 1.0;

        # Consider all relevant ratings
        foreach($relevant as $value) {
            $hamminess  *= (1.0 - $value);
            $spamminess *= $value;
        }

        # Calculate the combined rating

        # One minus the geometric mean of each product
        $hamminess  = 1 - pow($hamminess,  (1 / $n));
        $spamminess = 1 - pow($spamminess, (1 / $n));

        # Calculate the combined indicator
        $probability = ($hamminess - $spamminess) / ($hamminess + $spamminess);

        # We want a value between 0 and 1, not between -1 and +1, so ...
        $probability = (1 + $probability) / 2;

        # Alea iacta est
        return $probability;

    }

    /**
     * Calculate the spamminess of a single token also considering "degenerated" versions
     *
     * @access private
     * @param string $word
     * @param string $texts_ham  Number of learned ham texts
     * @param string $texts_spam Number of learned spam texts
     * @return float The rating of the token
     */

    private function _get_probability($word, $texts_ham, $texts_spam)
    {

        # Let's see what we have!

        if(isset($this->_token_data['tokens'][$word]) === TRUE) {
            # The token was in the database, so we can use its data as-is
            # and calculate the spamminess of this token directly
            return $this->_calc_probability($this->_token_data['tokens'][$word], $texts_ham, $texts_spam);
        }

        # The token was not found, so do we have at least similar words?

        if(isset($this->_token_data['degenerates'][$word]) === FALSE) {
            # The token is really unknown, so choose the default rating
            # for completely unknown tokens. This strips down to the
            # robX parameter so we can cheap out the freaky math ;-)
            return $this->config['rob_x'];
        }

        # We found similar words, so calculate the spamminess for each one
        # and choose the most important one for the further calculation

        # The default rating is 0.5 simply saying nothing
        $rating = 0.5;

        foreach($this->_token_data['degenerates'][$word] as $degenerate => $count) {

            # Calculate the rating of the current degenerated token
            $rating_tmp = $this->_calc_probability($count, $texts_ham, $texts_spam);

            # Is it more important than the rating of another degenerated version?
            if(abs(0.5 - $rating_tmp) > abs(0.5 - $rating))
                $rating = $rating_tmp;

        }

        return $rating;

    }

    /**
     * Do the actual spamminess calculation of a single token
     *
     * @access private
     * @param array $data        Must provide 'count_ham' and 'count_spam'
     * @param string $texts_ham  Number of learned ham texts
     * @param string $texts_spam Number of learned spam texts
     * @return float The rating of the token
     */

    private function _calc_probability($data, $texts_ham, $texts_spam)
    {

        $all = $data['count_ham'] + $data['count_spam'];

        # A token that was never counted tells us nothing. Robinson's formula
        # below yields exactly rob_x for zero occurrences, so return that
        # directly instead of dividing zero by zero in the basic probability.
        if($all <= 0)
            return $this->config['rob_x'];

        # Calculate the basic probability by Mr. Graham

        # But: consider the number of ham and spam texts saved instead of the
        # number of entries where the token appeared to calculate a relative
        # spamminess because we count tokens appearing multiple times not just
        # once but as often as they appear in the learned texts

        $rel_ham  = $data['count_ham'];
        $rel_spam = $data['count_spam'];

        if($texts_ham > 0)
            $rel_ham = $data['count_ham'] / $texts_ham;

        if($texts_spam > 0)
            $rel_spam = $data['count_spam'] / $texts_spam;

        $rating = $rel_spam / ($rel_ham + $rel_spam);

        # Calculate the better probability proposed by Mr. Robinson
        return (($this->config['rob_s'] * $this->config['rob_x']) + ($all * $rating)) / ($this->config['rob_s'] + $all);

    }

    /**
     * Check the validity of the category of a request
     *
     * @access private
     * @param string $category
     * @return boolean TRUE if $category is b8::HAM or b8::SPAM, otherwise FALSE
     */

    private function _check_category($category)
    {
        return $category === self::HAM or $category === self::SPAM;
    }

    /**
     * Learn a reference text
     *
     * @access public
     * @param string $text
     * @param const $category Either b8::SPAM or b8::HAM
     * @param mixed $uid      Passed through to the storage backend
     * @return mixed An error code, or whatever the storage backend returns
     */

    public function learn($text, $category, $uid)
    {
        return $this->_process_text($text, $category, self::LEARN, $uid);
    }

    /**
     * Unlearn a reference text
     *
     * @access public
     * @param string $text
     * @param const $category Either b8::SPAM or b8::HAM
     * @param mixed $uid      Passed through to the storage backend
     * @return mixed An error code, or whatever the storage backend returns
     */

    public function unlearn($text, $category, $uid)
    {
        return $this->_process_text($text, $category, self::UNLEARN, $uid);
    }

    /**
     * Does the actual interaction with the storage backend for learning or unlearning texts
     *
     * @access private
     * @param string $text
     * @param const $category Either b8::SPAM or b8::HAM
     * @param const $action   Either b8::LEARN or b8::UNLEARN
     * @param mixed $uid      Passed through to the storage backend
     * @return mixed The error code of validate(), b8::TRAINER_CATEGORY_FAIL, a lexer
     *               error code, or whatever the storage backend returns
     */

    private function _process_text($text, $category, $action, $uid = 0)
    {

        # Validate the startup

        $started_up = $this->validate();

        if($started_up !== TRUE)
            return $started_up;

        # Look if the request is okay
        if($this->_check_category($category) === FALSE)
            return self::TRAINER_CATEGORY_FAIL;

        # Get all tokens from $text

        $tokens = $this->_lexer->get_tokens($text);

        # Check if the lexer failed
        # (if so, $tokens will be a lexer error code, if not, $tokens will be an array)
        if(!is_array($tokens))
            return $tokens;

        # Pass the tokens and what to do with it to the storage backend
        return $this->_database->process_text($tokens, $category, $action, $uid);

    }

}

?>
\ No newline at end of file |