diff options
Diffstat (limited to 'library/spam/b8/storage/storage_base.php.ORIG')
-rw-r--r-- | library/spam/b8/storage/storage_base.php.ORIG | 395 |
1 files changed, 395 insertions, 0 deletions
<?php

# Copyright (C) 2010 Tobias Leupold <tobias.leupold@web.de>
#
# This file is part of the b8 package
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation in version 2.1 of the License.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
# License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

/**
 * Functions used by all storage backends
 * Copyright (C) 2010 Tobias Leupold <tobias.leupold@web.de>
 *
 * This class only implements the backend-independent logic. It calls
 * $this->_get_query(), $this->_put(), $this->_update(), $this->_del() and
 * $this->_commit() and reads $this->b8_config['degenerator'] and
 * $this->b8_config['today']; none of these are declared here, so the
 * concrete storage backend extending this class has to provide them.
 *
 * @license LGPL
 * @access public
 * @package b8
 * @author Tobias Leupold
 */

abstract class b8_storage_base
{

    /** Set to FALSE by check_database() when the database is unusable. */
    public $connected = FALSE;

    /** Degenerator instance, (re-)created by validate(). */
    protected $_degenerator = NULL;

    # Keys of the internal bookkeeping entries stored next to the tokens
    const INTERNALS_TEXTS_HAM    = 'bayes*texts.ham';
    const INTERNALS_TEXTS_SPAM   = 'bayes*texts.spam';
    const INTERNALS_DBVERSION    = 'bayes*dbversion';

    # Error codes returned instead of TRUE
    const BACKEND_NOT_CONNECTED  = 'BACKEND_NOT_CONNECTED';
    const DATABASE_WRONG_VERSION = 'DATABASE_WRONG_VERSION';
    const DATABASE_NOT_B8        = 'DATABASE_NOT_B8';

    /**
     * Validates the class has all it needs to work.
     *
     * As a side effect, a fresh degenerator is instantiated on every call.
     *
     * @access protected
     * @return mixed Returns TRUE if everything is okay, otherwise an error code.
     */

    protected function validate()
    {

        # We set up the degenerator here, as we would have to duplicate code if it
        # was done in the constructor of the respective storage backend.
        $class = 'b8_degenerator_' . $this->b8_config['degenerator'];
        $this->_degenerator = new $class();

        if($this->connected !== TRUE)
            return self::BACKEND_NOT_CONNECTED;

        return TRUE;

    }

    /**
     * Checks if a b8 database is used and if its version is okay.
     *
     * On any failure, $this->connected is set to FALSE.
     *
     * @access protected
     * @return mixed Returns TRUE if everything is okay, otherwise an error code.
     */

    protected function check_database()
    {

        # The raw entry is queried here on purpose: get_internals() casts the
        # version to int, so a missing entry could not be told apart from a
        # wrong version and DATABASE_NOT_B8 would never be reported.
        $raw = $this->_get_query(array(self::INTERNALS_DBVERSION));

        if(!is_array($raw) or !isset($raw[self::INTERNALS_DBVERSION])) {
            $this->connected = FALSE;
            return self::DATABASE_NOT_B8;
        }

        if((int) $raw[self::INTERNALS_DBVERSION] !== 2) {
            $this->connected = FALSE;
            return self::DATABASE_WRONG_VERSION;
        }

        return TRUE;

    }

    /**
     * Parses the "count" data of a token.
     *
     * The data is a space separated string "count_ham count_spam lastseen".
     * The lastseen part is not returned. Missing parts are treated as 0.
     *
     * @access private
     * @param string $data
     * @return array Returns an array of the parsed data: array('count_ham' => int, 'count_spam' => int).
     */

    private function _parse_count($data)
    {

        $parts = explode(' ', (string) $data);

        return array(
            'count_ham'  => isset($parts[0]) ? (int) $parts[0] : 0,
            'count_spam' => isset($parts[1]) ? (int) $parts[1] : 0
        );

    }

    /**
     * Reads one entry of a query result as an integer.
     *
     * @access private
     * @param mixed $data The result of a _get_query() call
     * @param string $key The entry to read
     * @return int The integer value of the entry or 0 if it does not exist.
     */

    private function _int_entry($data, $key)
    {

        if(is_array($data) and isset($data[$key]))
            return (int) $data[$key];

        return 0;

    }

    /**
     * Builds the "count" data string for a token, stamped with today's date.
     *
     * @access private
     * @param int $count_ham
     * @param int $count_spam
     * @return string "count_ham count_spam lastseen"
     */

    private function _build_count($count_ham, $count_spam)
    {
        return "$count_ham $count_spam " . $this->b8_config['today'];
    }

    /**
     * Get the database's internal variables.
     *
     * Entries missing in the database are reported as 0.
     *
     * @access public
     * @return array Returns an array of all internals: array('texts_ham' => int, 'texts_spam' => int, 'dbversion' => int).
     */

    public function get_internals()
    {

        $internals = $this->_get_query(
            array(
                self::INTERNALS_TEXTS_HAM,
                self::INTERNALS_TEXTS_SPAM,
                self::INTERNALS_DBVERSION
            )
        );

        return array(
            'texts_ham'  => $this->_int_entry($internals, self::INTERNALS_TEXTS_HAM),
            'texts_spam' => $this->_int_entry($internals, self::INTERNALS_TEXTS_SPAM),
            'dbversion'  => $this->_int_entry($internals, self::INTERNALS_DBVERSION)
        );

    }

    /**
     * Get all data about a list of tokens from the database.
     *
     * Every token (or degenerate) that was found also gets its lastseen
     * parameter set to today; these changes are committed before returning.
     *
     * @access public
     * @param array $tokens
     * @return mixed Returns an error code if the validation fails, otherwise returns array of returned data in the format array('tokens' => array(token => count), 'degenerates' => array(token => array(degenerate => count))).
     */

    public function get($tokens)
    {

        # Validate the startup

        $started_up = $this->validate();

        if($started_up !== TRUE)
            return $started_up;

        # First we see what we have in the database.
        $token_data = $this->_get_query($tokens);

        # Check if we have to degenerate some tokens

        $missing_tokens = array();

        foreach($tokens as $token) {
            if(!isset($token_data[$token]))
                $missing_tokens[] = $token;
        }

        if(count($missing_tokens) > 0) {

            # Generate a list of degenerated tokens for the missing tokens ...
            $degenerates = $this->_degenerator->degenerate($missing_tokens);

            $degenerates_list = array();

            foreach($degenerates as $token_degenerates)
                $degenerates_list = array_merge($degenerates_list, $token_degenerates);

            # ... and look them up. The array union is used instead of
            # array_merge() as the latter renumbers integer-like keys, which
            # would drop the data of purely numeric tokens such as "2010".
            $token_data = $this->_get_query($degenerates_list) + $token_data;

        }

        # Here, we have all available data in $token_data.

        $return_data_tokens = array();
        $return_data_degenerates = array();

        foreach($tokens as $token) {

            if(isset($token_data[$token])) {

                # The token was found in the database: add the data ...
                $counts = $this->_parse_count($token_data[$token]);
                $return_data_tokens[$token] = $counts;

                # ... and update its lastseen parameter
                $this->_update($token, $this->_build_count($counts['count_ham'], $counts['count_spam']));

                continue;

            }

            # The token was not found, so we look if we
            # can return data for degenerated tokens

            if(!isset($this->_degenerator->degenerates[$token]))
                continue;

            foreach($this->_degenerator->degenerates[$token] as $degenerate) {

                if(!isset($token_data[$degenerate]))
                    continue;

                # A degeneration of the token was found in the database: add the data ...
                $counts = $this->_parse_count($token_data[$degenerate]);
                $return_data_degenerates[$token][$degenerate] = $counts;

                # ... and update its lastseen parameter
                $this->_update($degenerate, $this->_build_count($counts['count_ham'], $counts['count_spam']));

            }

        }

        # Now, all token data directly found in the database is in $return_data_tokens
        # and all data for degenerated versions is in $return_data_degenerates

        # First, we commit the changes to the lastseen parameters
        $this->_commit();

        # Then, we return what we have
        return array(
            'tokens'      => $return_data_tokens,
            'degenerates' => $return_data_degenerates
        );

    }

    /**
     * Stores or deletes a list of tokens from the given category.
     *
     * @access public
     * @param array $tokens In the format array(token => count)
     * @param const $category Either b8::HAM or b8::SPAM
     * @param const $action Either b8::LEARN or b8::UNLEARN
     * @return mixed Returns an error code if the validation fails, otherwise nothing.
     */

    public function process_text($tokens, $category, $action)
    {

        # Validate the startup

        $started_up = $this->validate();

        if($started_up !== TRUE)
            return $started_up;

        # No matter what we do, we first have to check what data we have.

        # First get the internals, including the ham texts and spam texts counter
        $internals = $this->get_internals();

        # Then, fetch all data for all tokens we have
        $token_data = $this->_get_query(array_keys($tokens));

        # Process all tokens to learn/unlearn

        foreach($tokens as $token => $count) {

            $count = (int) $count;

            if(isset($token_data[$token])) {

                # We already have this token, so update its data

                $counts = $this->_parse_count($token_data[$token]);
                $count_ham = $counts['count_ham'];
                $count_spam = $counts['count_spam'];

                # Increase or decrease the right counter

                if($action === b8::LEARN) {
                    if($category === b8::HAM)
                        $count_ham += $count;
                    elseif($category === b8::SPAM)
                        $count_spam += $count;
                }

                elseif($action == b8::UNLEARN) {
                    if($category === b8::HAM)
                        $count_ham -= $count;
                    elseif($category === b8::SPAM)
                        $count_spam -= $count;
                }

                # We don't want to have negative values
                $count_ham = max(0, $count_ham);
                $count_spam = max(0, $count_spam);

                # Now let's see if we have to update or delete the token
                if($count_ham !== 0 or $count_spam !== 0)
                    $this->_update($token, $this->_build_count($count_ham, $count_spam));
                else
                    $this->_del($token);

            }

            elseif($action === b8::LEARN) {

                # We don't have the token. If we unlearn a text, we can't delete it
                # as we don't have it anyway, so just do something if we learn a text.
                # The token is stored with its real number of occurrences, like it
                # is done for known tokens. Nothing is stored for an unknown category.

                if($category === b8::HAM)
                    $this->_put($token, $this->_build_count($count, 0));
                elseif($category === b8::SPAM)
                    $this->_put($token, $this->_build_count(0, $count));

            }

        }

        # Now, all tokens have been processed, so let's update the right text counter

        if($category === b8::HAM) {
            $field = 'texts_ham';
            $key = self::INTERNALS_TEXTS_HAM;
        }
        elseif($category === b8::SPAM) {
            $field = 'texts_spam';
            $key = self::INTERNALS_TEXTS_SPAM;
        }
        else {
            $field = NULL;
            $key = NULL;
        }

        if($field !== NULL) {

            if($action === b8::LEARN) {
                $internals[$field]++;
                $this->_update($key, $internals[$field]);
            }

            elseif($action == b8::UNLEARN) {
                # The counter must not become negative
                $internals[$field] = max(0, $internals[$field] - 1);
                $this->_update($key, $internals[$field]);
            }

        }

        # We're done and can commit all changes to the database now
        $this->_commit();

    }

}
\ No newline at end of file |