From c8c062d96024a23e67e73f1ebffcf7009b18eed1 Mon Sep 17 00:00:00 2001 From: friendica Date: Tue, 31 Jan 2012 15:54:41 -0800 Subject: add spam engine --- library/spam/b8/storage/storage_base.php | 395 ++++++++++++++++++++++++++++++ library/spam/b8/storage/storage_dba.php | 198 +++++++++++++++ library/spam/b8/storage/storage_mysql.php | 351 ++++++++++++++++++++++++++ 3 files changed, 944 insertions(+) create mode 100644 library/spam/b8/storage/storage_base.php create mode 100644 library/spam/b8/storage/storage_dba.php create mode 100644 library/spam/b8/storage/storage_mysql.php (limited to 'library/spam/b8/storage') diff --git a/library/spam/b8/storage/storage_base.php b/library/spam/b8/storage/storage_base.php new file mode 100644 index 000000000..01f5a69d7 --- /dev/null +++ b/library/spam/b8/storage/storage_base.php @@ -0,0 +1,395 @@ + +# +# This file is part of the b8 package +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation in version 2.1 of the License. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +# License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 

/**
 * Functions used by all storage backends
 * Copyright (C) 2010 Tobias Leupold
 *
 * Concrete backends extend this class and have to provide the protected
 * primitives it calls (_get_query(), _put(), _update(), _del() and _commit())
 * as well as a $b8_config array with the keys 'degenerator' and 'today'.
 *
 * Every token record is handled as a string of the form
 * "count_ham count_spam lastseen".
 *
 * @license LGPL
 * @access public
 * @package b8
 * @author Tobias Leupold
 */

abstract class b8_storage_base
{

    public $connected = FALSE;

    protected $_degenerator = NULL;

    const INTERNALS_TEXTS_HAM    = 'bayes*texts.ham';
    const INTERNALS_TEXTS_SPAM   = 'bayes*texts.spam';
    const INTERNALS_DBVERSION    = 'bayes*dbversion';

    const BACKEND_NOT_CONNECTED  = 'BACKEND_NOT_CONNECTED';
    const DATABASE_WRONG_VERSION = 'DATABASE_WRONG_VERSION';
    const DATABASE_NOT_B8        = 'DATABASE_NOT_B8';

    /**
     * Validates the class has all it needs to work.
     *
     * @access protected
     * @return mixed Returns TRUE if everything is okay, otherwise an error code.
     */

    protected function validate()
    {

        # We set up the degenerator here, as we would have to duplicate code if it
        # was done in the constructor of the respective storage backend.
        $class = 'b8_degenerator_' . $this->b8_config['degenerator'];
        $this->_degenerator = new $class();

        if($this->connected !== TRUE)
            return self::BACKEND_NOT_CONNECTED;

        return TRUE;

    }

    /**
     * Checks if a b8 database is used and if its version is okay.
     * On any failure, $connected is reset to FALSE.
     *
     * @access protected
     * @return mixed Returns TRUE if everything is okay, otherwise an error code.
     */

    protected function check_database()
    {

        $internals = $this->get_internals();

        # get_internals() reports a missing version record as NULL, so a
        # database without b8 data can be told apart from an outdated one.
        if(!isset($internals['dbversion'])) {
            $this->connected = FALSE;
            return self::DATABASE_NOT_B8;
        }

        if((int) $internals['dbversion'] !== 2) {
            $this->connected = FALSE;
            return self::DATABASE_WRONG_VERSION;
        }

        return TRUE;

    }

    /**
     * Parses the "count" data of a token.
     *
     * Missing fields are treated as 0 instead of raising notices; the
     * lastseen field is not part of the result.
     *
     * @access private
     * @param string $data Record of the form "count_ham count_spam lastseen"
     * @return array Returns array('count_ham' => int, 'count_spam' => int).
     */

    private function _parse_count($data)
    {

        $fields = explode(' ', (string) $data);

        return array(
            'count_ham'  => isset($fields[0]) ? (int) $fields[0] : 0,
            'count_spam' => isset($fields[1]) ? (int) $fields[1] : 0
        );

    }

    /**
     * Get the database's internal variables.
     *
     * @access public
     * @return array Returns array('texts_ham' => int, 'texts_spam' => int, 'dbversion' => int|NULL).
     *               The text counters default to 0 and 'dbversion' is NULL
     *               when the respective record does not exist.
     */

    public function get_internals()
    {

        $stored = $this->_get_query(
            array(
                self::INTERNALS_TEXTS_HAM,
                self::INTERNALS_TEXTS_SPAM,
                self::INTERNALS_DBVERSION
            )
        );

        return array(
            'texts_ham'  => isset($stored[self::INTERNALS_TEXTS_HAM])  ? (int) $stored[self::INTERNALS_TEXTS_HAM]  : 0,
            'texts_spam' => isset($stored[self::INTERNALS_TEXTS_SPAM]) ? (int) $stored[self::INTERNALS_TEXTS_SPAM] : 0,
            'dbversion'  => isset($stored[self::INTERNALS_DBVERSION])  ? (int) $stored[self::INTERNALS_DBVERSION]  : NULL
        );

    }

    /**
     * Get all data about a list of tokens from the database.
     *
     * Tokens that are not stored themselves are degenerated and the
     * degenerated forms are looked up instead. The lastseen field of every
     * record that was found is refreshed with $b8_config['today'].
     *
     * @access public
     * @param array $tokens
     * @return mixed Returns an error code if validate() fails, otherwise an array in the format array('tokens' => array(token => count), 'degenerates' => array(token => array(degenerate => count))).
     */

    public function get($tokens)
    {

        # Validate the startup
        $started_up = $this->validate();

        if($started_up !== TRUE)
            return $started_up;

        # First we see what we have in the database.
        $token_data = $this->_get_query($tokens);

        # Check if we have to degenerate some tokens
        $missing_tokens = array();

        foreach($tokens as $token) {
            if(!isset($token_data[$token]))
                $missing_tokens[] = $token;
        }

        if(count($missing_tokens) > 0) {

            # Generate a list of degenerated tokens for the missing tokens ...
            $degenerates = $this->_degenerator->degenerate($missing_tokens);

            $degenerates_list = array();

            foreach($degenerates as $token_degenerates)
                $degenerates_list = array_merge($degenerates_list, $token_degenerates);

            # ... and look them up. The array union is used on purpose:
            # array_merge() would renumber purely numeric tokens (PHP stores
            # them as integer keys) and thereby lose their data.
            if(count($degenerates_list) > 0)
                $token_data = $token_data + $this->_get_query($degenerates_list);

        }

        # Here, we have all available data in $token_data.

        $today = $this->b8_config['today'];

        $return_data_tokens = array();
        $return_data_degenerates = array();

        foreach($tokens as $token) {

            if(isset($token_data[$token])) {

                # The token was found in the database: add the data ...
                $counts = $this->_parse_count($token_data[$token]);
                $return_data_tokens[$token] = $counts;

                # ... and update its lastseen parameter
                $this->_update($token, "{$counts['count_ham']} {$counts['count_spam']} " . $today);

                continue;

            }

            # The token was not found, so we look if we can return data for
            # degenerated versions of it.
            $candidates = isset($this->_degenerator->degenerates[$token])
                ? $this->_degenerator->degenerates[$token]
                : array();

            foreach($candidates as $degenerate) {

                if(!isset($token_data[$degenerate]))
                    continue;

                # A degeneration of the token was found in the database:
                # add the data and update its lastseen parameter
                $counts = $this->_parse_count($token_data[$degenerate]);
                $return_data_degenerates[$token][$degenerate] = $counts;

                $this->_update($degenerate, "{$counts['count_ham']} {$counts['count_spam']} " . $today);

            }

        }

        # First, we commit the changes to the lastseen parameters
        $this->_commit();

        # Then, we return what we have
        return array(
            'tokens'      => $return_data_tokens,
            'degenerates' => $return_data_degenerates
        );

    }

    /**
     * Stores or deletes a list of tokens from the given category.
     *
     * @access public
     * @param array $tokens Map of token => number of occurrences in the text
     * @param const $category Either b8::HAM or b8::SPAM
     * @param const $action Either b8::LEARN or b8::UNLEARN
     * @return mixed Returns an error code if validate() fails, otherwise nothing.
     */

    public function process_text($tokens, $category, $action)
    {

        # Validate the startup
        $started_up = $this->validate();

        if($started_up !== TRUE)
            return $started_up;

        $learning   = ($action === b8::LEARN);
        $unlearning = ($action === b8::UNLEARN);
        $is_ham     = ($category === b8::HAM);
        $is_spam    = ($category === b8::SPAM);

        $today = $this->b8_config['today'];

        # No matter what we do, we first have to check what data we have.

        # First get the internals, including the ham texts and spam texts counter
        $internals = $this->get_internals();

        # Then, fetch all data for all tokens we have
        $token_data = $this->_get_query(array_keys($tokens));

        # Process all tokens to learn/unlearn

        foreach($tokens as $token => $count) {

            $count = (int) $count;

            if(isset($token_data[$token])) {

                # We already have this token, so update its data
                $stored     = $this->_parse_count($token_data[$token]);
                $count_ham  = $stored['count_ham'];
                $count_spam = $stored['count_spam'];

                # Increase or decrease the right counter
                if($learning)
                    $delta = $count;
                elseif($unlearning)
                    $delta = -$count;
                else
                    $delta = 0;

                if($is_ham)
                    $count_ham += $delta;
                elseif($is_spam)
                    $count_spam += $delta;

                # We don't want to have negative values
                $count_ham  = max(0, $count_ham);
                $count_spam = max(0, $count_spam);

                # Now let's see if we have to update or delete the token
                if($count_ham !== 0 or $count_spam !== 0)
                    $this->_update($token, "$count_ham $count_spam " . $today);
                else
                    $this->_del($token);

            }

            elseif($learning) {

                # We don't have the token. If we unlearn a text, we can't delete it
                # as we don't have it anyway, so just do something if we learn a text.
                # The occurrence count is stored exactly as it would be added
                # to an already known token.
                if($is_ham)
                    $data = "$count 0 ";
                elseif($is_spam)
                    $data = "0 $count ";
                else
                    continue; # unknown category: nothing sensible to store

                $this->_put($token, $data . $today);

            }

        }

        # Now, all tokens have been processed, so let's update the right text counter

        if($is_ham) {
            $counter_name = 'texts_ham';
            $counter_key  = self::INTERNALS_TEXTS_HAM;
        }
        elseif($is_spam) {
            $counter_name = 'texts_spam';
            $counter_key  = self::INTERNALS_TEXTS_SPAM;
        }
        else {
            $counter_name = NULL;
            $counter_key  = NULL;
        }

        if($counter_name !== NULL and ($learning or $unlearning)) {

            $internals[$counter_name] += $learning ? 1 : -1;

            if($internals[$counter_name] < 0)
                $internals[$counter_name] = 0;

            $this->_update($counter_key, $internals[$counter_name]);

        }

        # We're done and can commit all changes to the database now
        $this->_commit();

    }

}

# \ No newline at end of file
# diff --git a/library/spam/b8/storage/storage_dba.php b/library/spam/b8/storage/storage_dba.php
# new file mode 100644
# index 000000000..04618b23e
# --- /dev/null
# +++ b/library/spam/b8/storage/storage_dba.php
# @@ -0,0 +1,198 @@

#
# This file is part of the b8 package
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation in version 2.1 of the License.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU Lesser General Public
# License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

/**
 * The DBA (Berkeley DB) abstraction layer for communicating with the database.
 * Copyright (C) 2006-2010 Tobias Leupold
 *
 * @license LGPL
 * @access public
 * @package b8
 * @author Tobias Leupold
 */

class b8_storage_dba extends b8_storage_base
{

    const DATABASE_CONNECTION_FAIL = 'DATABASE_CONNECTION_FAIL';

    # Backend settings; every value passed to the constructor is stored as a string.
    public $config = array(
        'database' => 'wordlist.db',
        'handler'  => 'db4'
    );

    # Settings handed over from the main b8 configuration.
    public $b8_config = array(
        'degenerator' => NULL,
        'today'       => NULL
    );

    # DBA handle, NULL as long as no database has been opened.
    private $_db = NULL;

    /**
     * Sets up the backend; the database itself is only opened by connect().
     *
     * @access public
     * @param array $config Settings overriding the defaults in $config
     * @param string $degenerator Stored as $b8_config['degenerator']
     * @param string $today Stored as $b8_config['today']
     */

    function __construct($config, $degenerator, $today)
    {

        # Take over the values coming from the main b8 config
        $this->b8_config['today']       = $today;
        $this->b8_config['degenerator'] = $degenerator;

        # Nothing to override
        if(count($config) <= 0)
            return;

        foreach($config as $key => $setting)
            $this->config[$key] = (string) $setting;

    }

    /**
     * Closes the database handle if one was opened.
     *
     * @access public
     * @return void
     */

    function __destruct()
    {

        if($this->_db === NULL)
            return;

        dba_close($this->_db);
        $this->connected = FALSE;

    }

    /**
     * Opens the database file and checks that it holds b8 data.
     *
     * The file is looked up one directory above this source file.
     *
     * @access public
     * @return mixed Returns TRUE on success, otherwise an error code.
     */

    public function connect()
    {

        # A handle already exists, nothing to do
        if($this->_db !== NULL)
            return TRUE;

        $path = implode(
            DIRECTORY_SEPARATOR,
            array(dirname(__FILE__), '..', $this->config['database'])
        );

        $handle = dba_open($path, 'w', $this->config['handler']);

        if($handle === FALSE) {
            $this->_db = NULL;
            $this->connected = FALSE;
            return self::DATABASE_CONNECTION_FAIL;
        }

        $this->_db = $handle;
        $this->connected = TRUE;

        # The final verdict depends on the content of the database
        return $this->check_database();

    }

    /**
     * Fetches the stored data for a list of tokens.
     *
     * @access protected
     * @param array $tokens
     * @return array Returns array(token => data) for all tokens that were found, possibly empty.
     */

    protected function _get_query($tokens)
    {

        $found = array();

        foreach($tokens as $key) {

            $value = dba_fetch($key, $this->_db);

            # dba_fetch() reports a missing key with FALSE
            if($value === FALSE)
                continue;

            $found[$key] = $value;

        }

        return $found;

    }

    /**
     * Inserts a new token.
     *
     * @access protected
     * @param string $token
     * @param string $count
     * @return bool TRUE on success or FALSE on failure
     */

    protected function _put($token, $count)
    {
        $inserted = dba_insert($token, $count, $this->_db);
        return $inserted;
    }

    /**
     * Replaces the data of a token.
     *
     * @access protected
     * @param string $token
     * @param string $count
     * @return bool TRUE on success or FALSE on failure
     */

    protected function _update($token, $count)
    {
        $replaced = dba_replace($token, $count, $this->_db);
        return $replaced;
    }

    /**
     * Deletes a token.
     *
     * @access protected
     * @param string $token
     * @return bool TRUE on success or FALSE on failure
     */

    protected function _del($token)
    {
        $deleted = dba_delete($token, $this->_db);
        return $deleted;
    }

    /**
     * Intentionally empty: this backend writes every change immediately.
     *
     * @access protected
     * @return void
     */

    protected function _commit()
    {
        # Only present because the (My)SQL backend(s) need such a method.
    }

}

# \ No newline at end of file
# diff --git a/library/spam/b8/storage/storage_mysql.php b/library/spam/b8/storage/storage_mysql.php
# new file mode 100644
# index 000000000..022536350
# --- /dev/null
# +++ b/library/spam/b8/storage/storage_mysql.php
# @@ -0,0 +1,351 @@

#
# This file is part of the b8 package
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation in version 2.1 of the License.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
# License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

/**
 * The MySQL abstraction layer for communicating with the database.
+ * Copyright (C) 2009 Oliver Lillie (aka buggedcom) + * Copyright (C) 2010-2011 Tobias Leupold + * + * @license LGPL + * @access public + * @package b8 + * @author Oliver Lillie (aka buggedcom) (original PHP 5 port and optimizations) + * @author Tobias Leupold + */ + +class b8_storage_mysql extends b8_storage_base +{ + + public $config = array( + 'database' => 'b8_wordlist', + 'table_name' => 'b8_wordlist', + 'host' => 'localhost', + 'user' => FALSE, + 'pass' => FALSE, + 'connection' => NULL + ); + + public $b8_config = array( + 'degenerator' => NULL, + 'today' => NULL + ); + + private $_connection = NULL; + private $_deletes = array(); + private $_puts = array(); + private $_updates = array(); + + const DATABASE_CONNECTION_FAIL = 'DATABASE_CONNECTION_FAIL'; + const DATABASE_CONNECTION_ERROR = 'DATABASE_CONNECTION_ERROR'; + const DATABASE_CONNECTION_BAD_RESOURCE = 'DATABASE_CONNECTION_BAD_RESOURCE'; + const DATABASE_SELECT_ERROR = 'DATABASE_SELECT_ERROR'; + const DATABASE_TABLE_ACCESS_FAIL = 'DATABASE_TABLE_ACCESS_FAIL'; + const DATABASE_WRONG_VERSION = 'DATABASE_WRONG_VERSION'; + + /** + * Constructs the database layer. + * + * @access public + * @param string $config + */ + + function __construct($config, $degenerator, $today) + { + + # Pass some variables of the main b8 config to this class + $this->b8_config['degenerator'] = $degenerator; + $this->b8_config['today'] = $today; + + # Validate the config items + + if(count($config) > 0) { + + foreach ($config as $name => $value) { + + switch($name) { + + case 'table_name': + case 'host': + case 'user': + case 'pass': + case 'database': + $this->config[$name] = (string) $value; + break; + + case 'connection': + + if($value !== NULL) { + + if(is_resource($value) === TRUE) { + $resource_type = get_resource_type($value); + $this->config['connection'] = $resource_type !== 'mysql link' && $resource_type !== 'mysql link persistent' ? 
FALSE : $value; + } + + else + $this->config['connection'] = FALSE; + + } + + break; + + } + + } + + } + + } + + /** + * Closes the database connection. + * + * @access public + * @return void + */ + + function __destruct() + { + + if($this->_connection === NULL) + return; + + # Commit any changes before closing + $this->_commit(); + + # Just close the connection if no link-resource was passed and b8 created it's own connection + if($this->config['connection'] === NULL) + mysql_close($this->_connection); + + $this->connected = FALSE; + + } + + /** + * Connect to the database and do some checks. + * + * @access public + * @return mixed Returns TRUE on a successful database connection, otherwise returns a constant from b8. + */ + + public function connect() + { + + # Are we already connected? + if($this->connected === TRUE) + return TRUE; + + # Are we using an existing passed resource? + if($this->config['connection'] === FALSE) { + # ... yes we are, but the connection is not a resource, so return an error + $this->connected = FALSE; + return self::DATABASE_CONNECTION_BAD_RESOURCE; + } + + elseif($this->config['connection'] === NULL) { + + # ... no we aren't so we have to connect. + + if($this->_connection = mysql_connect($this->config['host'], $this->config['user'], $this->config['pass'])) { + if(mysql_select_db($this->config['database'], $this->_connection) === FALSE) { + $this->connected = FALSE; + return self::DATABASE_SELECT_ERROR . ": " . mysql_error(); + } + } + else { + $this->connected = FALSE; + return self::DATABASE_CONNECTION_ERROR; + } + + } + + else { + # ... yes we are + $this->_connection = $this->config['connection']; + } + + # Just in case ... + if($this->_connection === NULL) { + $this->connected = FALSE; + return self::DATABASE_CONNECTION_FAIL; + } + + # Check to see if the wordlist table exists + if(mysql_query('DESCRIBE ' . 
$this->config['table_name'], $this->_connection) === FALSE) { + $this->connected = FALSE; + return self::DATABASE_TABLE_ACCESS_FAIL . ": " . mysql_error(); + } + + # Everything is okay and connected + $this->connected = TRUE; + + # Let's see if this is a b8 database and the version is okay + return $this->check_database(); + + } + + /** + * Does the actual interaction with the database when fetching data. + * + * @access protected + * @param array $tokens + * @return mixed Returns an array of the returned data in the format array(token => data) or an empty array if there was no data. + */ + + protected function _get_query($tokens) + { + + # Construct the query ... + + if(count($tokens) > 0) { + + $where = array(); + + foreach ($tokens as $token) { + $token = mysql_real_escape_string($token, $this->_connection); + array_push($where, $token); + } + + $where = 'token IN ("' . implode('", "', $where) . '")'; + } + + else { + $token = mysql_real_escape_string($token, $this->_connection); + $where = 'token = "' . $token . '"'; + } + + # ... and fetch the data + + $result = mysql_query(' + SELECT token, count + FROM ' . $this->config['table_name'] . ' + WHERE ' . $where . '; + ', $this->_connection); + + $data = array(); + + while ($row = mysql_fetch_array($result, MYSQL_ASSOC)) + $data[$row['token']] = $row['count']; + + mysql_free_result($result); + + return $data; + + } + + /** + * Store a token to the database. + * + * @access protected + * @param string $token + * @param string $count + * @return void + */ + + protected function _put($token, $count) { + $token = mysql_real_escape_string($token, $this->_connection); + $count = mysql_real_escape_string($count, $this->_connection);; + array_push($this->_puts, '("' . $token . '", "' . $count . '")'); + } + + /** + * Update an existing token. 
+ * + * @access protected + * @param string $token + * @param string $count + * @return void + */ + + protected function _update($token, $count) + { + $token = mysql_real_escape_string($token, $this->_connection); + $count = mysql_real_escape_string($count, $this->_connection); + array_push($this->_updates, '("' . $token . '", "' . $count . '")'); + } + + /** + * Remove a token from the database. + * + * @access protected + * @param string $token + * @return void + */ + + protected function _del($token) + { + $token = mysql_real_escape_string($token, $this->_connection); + array_push($this->_deletes, $token); + } + + /** + * Commits any modification queries. + * + * @access protected + * @return void + */ + + protected function _commit() + { + + if(count($this->_deletes) > 0) { + + $result = mysql_query(' + DELETE FROM ' . $this->config['table_name'] . ' + WHERE token IN ("' . implode('", "', $this->_deletes) . '"); + ', $this->_connection); + + if(is_resource($result) === TRUE) + mysql_free_result($result); + + $this->_deletes = array(); + + } + + if(count($this->_puts) > 0) { + + $result = mysql_query(' + INSERT INTO ' . $this->config['table_name'] . '(token, count) + VALUES ' . implode(', ', $this->_puts) . ';', $this->_connection); + + if(is_resource($result) === TRUE) + mysql_free_result($result); + + $this->_puts = array(); + + } + + if(count($this->_updates) > 0) { + + $result = mysql_query(' + INSERT INTO ' . $this->config['table_name'] . '(token, count) + VALUES ' . implode(', ', $this->_updates) . ' + ON DUPLICATE KEY UPDATE ' . $this->config['table_name'] . '.count = VALUES(count);', $this->_connection); + + if(is_resource($result) === TRUE) + mysql_free_result($result); + + $this->_updates = array(); + + } + + } + +} + +?> \ No newline at end of file -- cgit v1.2.3