diff -ur Mail-SpamAssassin-3.2.4.orig/lib/Mail/SpamAssassin/BayesStore/MySQL.pm Mail-SpamAssassin-3.2.4/lib/Mail/SpamAssassin/BayesStore/MySQL.pm --- Mail-SpamAssassin-3.2.4.orig/lib/Mail/SpamAssassin/BayesStore/MySQL.pm 2008-01-05 22:10:35.000000000 +0100 +++ Mail-SpamAssassin-3.2.4/lib/Mail/SpamAssassin/BayesStore/MySQL.pm 2008-12-21 20:29:57.265157170 +0100 @@ -75,7 +75,7 @@ my $too_old = $vars[10] - $newdelta; # tooold = newest - delta # if token atime > newest, reset to newest ... - my $sql = "UPDATE bayes_token SET atime = ? + my $sql = "UPDATE $$self{_token_table} SET atime = ? WHERE id = ? AND atime > ?"; @@ -89,7 +89,7 @@ } # Check to make sure the expire won't remove too many tokens - $sql = "SELECT count(token) FROM bayes_token + $sql = "SELECT count(token) FROM $$self{_token_table} WHERE id = ? AND atime < ?"; @@ -124,7 +124,7 @@ } else { # Do the expire - $sql = "DELETE from bayes_token + $sql = "DELETE from $$self{_token_table} WHERE id = ? AND atime < ?"; @@ -146,7 +146,7 @@ last_atime_delta = ?, last_expire_reduce = ?, oldest_token_age = (SELECT min(atime) - FROM bayes_token + FROM $$self{_token_table} WHERE id = ?) WHERE id = ?"; @@ -415,7 +415,7 @@ # shortcut, will only update atime for the token if the atime is less than # what we are updating to - my $sql = "UPDATE bayes_token + my $sql = "UPDATE $$self{_token_table} SET atime = ? WHERE id = ? AND token = ? @@ -477,7 +477,7 @@ return 1 unless (scalar(@{$tokens})); - my $sql = "UPDATE bayes_token SET atime = ? WHERE id = ? AND token IN ("; + my $sql = "UPDATE $$self{_token_table} SET atime = ? WHERE id = ? AND token IN ("; my @bindings = ($atime, $self->{_userid}); foreach my $token (@{$tokens}) { @@ -538,7 +538,7 @@ # cleanup was needed, go ahead and clear the cleanup flag $self->{needs_cleanup} = 0; - my $sql = "DELETE from bayes_token + my $sql = "DELETE from $$self{_token_table} WHERE id = ? 
AND spam_count <= 0 AND ham_count <= 0"; @@ -616,7 +616,7 @@ return 0; } - $rows = $self->{_dbh}->do("DELETE FROM bayes_token WHERE id = ?", + $rows = $self->{_dbh}->do("DELETE FROM $$self{_token_table} WHERE id = ?", undef, $self->{_userid}); unless (defined($rows)) { @@ -785,7 +785,7 @@ # counts may have both reached 0 $self->{needs_cleanup} = 1; - my $sql = "UPDATE bayes_token SET spam_count = GREATEST(spam_count + ?, 0), + my $sql = "UPDATE $$self{_token_table} SET spam_count = GREATEST(spam_count + ?, 0), ham_count = GREATEST(ham_count + ?, 0) WHERE id = ? AND token = ?"; @@ -810,7 +810,7 @@ } } else { - my $sql = "INSERT INTO bayes_token + my $sql = "INSERT INTO $$self{_token_table} (id, token, spam_count, ham_count, atime) VALUES (?,?,?,?,?) ON DUPLICATE KEY UPDATE spam_count = GREATEST(spam_count + ?, 0), @@ -918,7 +918,7 @@ # counts may have both reached 0 $self->{needs_cleanup} = 1; - my $sql = "UPDATE bayes_token SET spam_count = GREATEST(spam_count + ?, 0), + my $sql = "UPDATE $$self{_token_table} SET spam_count = GREATEST(spam_count + ?, 0), ham_count = GREATEST(ham_count + ?, 0) WHERE id = ? AND token = ?"; @@ -952,7 +952,7 @@ } } else { - my $sql = "INSERT INTO bayes_token + my $sql = "INSERT INTO $$self{_token_table} (id, token, spam_count, ham_count, atime) VALUES (?,?,?,?,?) ON DUPLICATE KEY UPDATE spam_count = GREATEST(spam_count + ?, 0), diff -ur Mail-SpamAssassin-3.2.4.orig/lib/Mail/SpamAssassin/BayesStore/PgSQL.pm Mail-SpamAssassin-3.2.4/lib/Mail/SpamAssassin/BayesStore/PgSQL.pm --- Mail-SpamAssassin-3.2.4.orig/lib/Mail/SpamAssassin/BayesStore/PgSQL.pm 2008-01-05 22:10:35.000000000 +0100 +++ Mail-SpamAssassin-3.2.4/lib/Mail/SpamAssassin/BayesStore/PgSQL.pm 2008-12-21 20:29:57.275157321 +0100 @@ -77,7 +77,7 @@ my $too_old = $vars[10] - $newdelta; # tooold = newest - delta # if token atime > newest, reset to newest ... - my $sql = "UPDATE bayes_token SET atime = ? + my $sql = "UPDATE $$self{_token_table} SET atime = ? WHERE id = ? 
AND atime > ?"; @@ -91,7 +91,7 @@ } # Check to make sure the expire won't remove too many tokens - $sql = "SELECT count(token) FROM bayes_token + $sql = "SELECT count(token) FROM $$self{_token_table} WHERE id = ? AND atime < ?"; @@ -126,7 +126,7 @@ } else { # Do the expire - $sql = "DELETE from bayes_token + $sql = "DELETE from $$self{_token_table} WHERE id = ? AND atime < ?"; @@ -148,7 +148,7 @@ last_atime_delta = ?, last_expire_reduce = ?, oldest_token_age = (SELECT min(atime) - FROM bayes_token + FROM $$self{_token_table} WHERE id = ?) WHERE id = ?"; @@ -358,7 +358,7 @@ return (0,0,0) unless (defined($self->{_dbh})); my $sql = "SELECT spam_count, ham_count, atime - FROM bayes_token + FROM $$self{_token_table} WHERE id = ? AND token = ?"; @@ -415,7 +415,7 @@ my $bunch_end; my $multi_sql = "SELECT token, spam_count, ham_count, atime - FROM bayes_token + FROM $$self{_token_table} WHERE id = ? AND token IN "; @@ -558,7 +558,7 @@ # shortcut, will only update atime for the token if the atime is less than # what we are updating to - my $sql = "UPDATE bayes_token + my $sql = "UPDATE $$self{_token_table} SET atime = ? WHERE id = ? AND token = ? @@ -644,7 +644,7 @@ return 1 unless (scalar(@{$tokens})); - my $sql = "UPDATE bayes_token SET atime = ? WHERE id = ? AND token IN ("; + my $sql = "UPDATE $$self{_token_table} SET atime = ? WHERE id = ? AND token IN ("; my @bindings; foreach my $token (sort @{$tokens}) { @@ -770,7 +770,7 @@ # cleanup was needed, go ahead and clear the cleanup flag $self->{needs_cleanup} = 0; - my $sql = "DELETE from bayes_token + my $sql = "DELETE from $$self{_token_table} WHERE id = ? 
AND spam_count <= 0 AND ham_count <= 0"; @@ -849,7 +849,7 @@ return 0; } - $rows = $self->{_dbh}->do("DELETE FROM bayes_token WHERE id = ?", + $rows = $self->{_dbh}->do("DELETE FROM $$self{_token_table} WHERE id = ?", undef, $self->{_userid}); unless (defined($rows)) { diff -ur Mail-SpamAssassin-3.2.4.orig/lib/Mail/SpamAssassin/BayesStore/SQL.pm Mail-SpamAssassin-3.2.4/lib/Mail/SpamAssassin/BayesStore/SQL.pm --- Mail-SpamAssassin-3.2.4.orig/lib/Mail/SpamAssassin/BayesStore/SQL.pm 2008-01-05 22:10:35.000000000 +0100 +++ Mail-SpamAssassin-3.2.4/lib/Mail/SpamAssassin/BayesStore/SQL.pm 2008-12-21 20:29:57.275157321 +0100 @@ -36,6 +36,7 @@ use Mail::SpamAssassin::BayesStore; use Mail::SpamAssassin::Logger; use Digest::SHA1 qw(sha1); +use String::CRC32; use vars qw( @ISA ); @@ -238,7 +239,7 @@ return %delta unless (defined($self->{_dbh})); my $sql = "SELECT count(*) - FROM bayes_token + FROM $$self{_token_table} WHERE id = ? AND atime < ?"; @@ -290,7 +291,7 @@ my $too_old = $vars[10] - $newdelta; # tooold = newest - delta # if token atime > newest, reset to newest ... - my $sql = "UPDATE bayes_token SET atime = ? + my $sql = "UPDATE $$self{_token_table} SET atime = ? WHERE id = ? AND atime > ?"; @@ -303,7 +304,7 @@ } # Check to make sure the expire won't remove too many tokens - $sql = "SELECT count(token) FROM bayes_token + $sql = "SELECT count(token) FROM $$self{_token_table} WHERE id = ? AND atime < ?"; @@ -336,7 +337,7 @@ } else { # Do the expire - $sql = "DELETE from bayes_token + $sql = "DELETE from $$self{_token_table} WHERE id = ? AND atime < ?"; @@ -628,7 +629,7 @@ my $token_select = $self->_token_select_string(); my $sql = "SELECT $token_select, spam_count, ham_count, atime - FROM bayes_token + FROM $$self{_token_table} WHERE id = ? AND (spam_count > 0 OR ham_count > 0)"; @@ -806,7 +807,7 @@ return (0,0,0) unless (defined($self->{_dbh})); my $sql = "SELECT spam_count, ham_count, atime - FROM bayes_token + FROM $$self{_token_table} WHERE id = ? 
AND token = ?"; @@ -861,7 +862,7 @@ my $token_select = $self->_token_select_string(); my $multi_sql = "SELECT $token_select, spam_count, ham_count, atime - FROM bayes_token + FROM $$self{_token_table} WHERE id = ? AND token IN "; @@ -1058,7 +1059,7 @@ # shortcut, will only update atime for the token if the atime is less than # what we are updating to - my $sql = "UPDATE bayes_token + my $sql = "UPDATE $$self{_token_table} SET atime = ? WHERE id = ? AND token = ? @@ -1117,7 +1118,7 @@ return 1 unless (scalar(@{$tokens})); - my $sql = "UPDATE bayes_token SET atime = ? WHERE id = ? AND token IN ("; + my $sql = "UPDATE $$self{_token_table} SET atime = ? WHERE id = ? AND token IN ("; my @bindings = ($atime, $self->{_userid}); foreach my $token (@{$tokens}) { @@ -1176,7 +1177,7 @@ # cleanup was needed, go ahead and clear the cleanup flag $self->{needs_cleanup} = 0; - my $sql = "DELETE from bayes_token + my $sql = "DELETE from $$self{_token_table} WHERE id = ? AND spam_count = 0 AND ham_count = 0"; @@ -1299,7 +1300,7 @@ return 0; } - $rows = $self->{_dbh}->do("DELETE FROM bayes_token WHERE id = ?", + $rows = $self->{_dbh}->do("DELETE FROM $$self{_token_table} WHERE id = ?", undef, $self->{_userid}); unless (defined($rows)) { @@ -1338,7 +1339,7 @@ my $token_select = $self->_token_select_string(); my $token_sql = "SELECT spam_count, ham_count, atime, $token_select - FROM bayes_token + FROM $$self{_token_table} WHERE id = ? 
AND (spam_count > 0 OR ham_count > 0)"; @@ -1745,7 +1746,7 @@ } } - my $sqlselect = "SELECT id FROM bayes_vars WHERE username = ?"; + my $sqlselect = "SELECT id, token_table FROM bayes_vars WHERE username = ?"; my $sthselect = $self->{_dbh}->prepare_cached($sqlselect); @@ -1761,24 +1762,32 @@ return 0; } - my ($id) = $sthselect->fetchrow_array(); + my ($id, $token_table) = $sthselect->fetchrow_array(); if ($id) { $self->{_userid} = $id; - dbg("bayes: Using userid: ".$self->{_userid}); + $self->{_token_table} = $token_table; + dbg("bayes: Using userid: ".$self->{_userid}.", token table: ".$self->{_token_table}); $sthselect->finish(); return 1; } + $self->{_token_table} = "bayes_token"; # Do not create an entry for this user unless we were specifically asked to return 0 unless ($create_entry_p); # For now let the database setup the other variables as defaults - my $sqlinsert = "INSERT INTO bayes_vars (username) VALUES (?)"; + my $sqlinsert = "INSERT INTO bayes_vars (username, token_table) VALUES (?, ?)"; + if ($self->{bayes}->{conf}->{bayes_sql_token_table_count} + && $self->{bayes}->{conf}->{bayes_sql_token_table_count} > 1) { + $self->{_token_table} = "bayes_token_" + . (crc32($self->{_username}) % $self->{bayes}->{conf}->{bayes_sql_token_table_count}); # modulus must use the same setting the guard checks + } my $rows = $self->{_dbh}->do($sqlinsert, undef, - $self->{_username}); + $self->{_username}, + $self->{_token_table}); unless (defined($rows)) { dbg("bayes: _initialize_db: SQL error: ".$self->{_dbh}->errstr()); return 0; @@ -1843,7 +1852,7 @@ # if we are unable to find an entry. return 1 if ($spam_count < 0 || $ham_count < 0); - my $sql = "INSERT INTO bayes_token + my $sql = "INSERT INTO $$self{_token_table} (id, token, spam_count, ham_count, atime) VALUES (?,?,?,?,?)"; @@ -1930,7 +1939,7 @@ my $sql; my @args; if ($update_atime_p) { - $sql = "UPDATE bayes_token + $sql = "UPDATE $$self{_token_table} SET spam_count = spam_count + ?, atime = ? WHERE id = ? 
@@ -1940,7 +1949,7 @@ $updated_atime_p = 1; # note the fact that we did do it } else { - $sql = "UPDATE bayes_token + $sql = "UPDATE $$self{_token_table} SET spam_count = spam_count + ? WHERE id = ? AND token = ? @@ -1960,7 +1969,7 @@ my $sql; my @args; if ($update_atime_p && !$updated_atime_p) { - $sql = "UPDATE bayes_token + $sql = "UPDATE $$self{_token_table} SET ham_count = ham_count + ?, atime = ? WHERE id = ? @@ -1970,7 +1979,7 @@ $updated_atime_p = 1; # note the fact that we did do it } else { - $sql = "UPDATE bayes_token + $sql = "UPDATE $$self{_token_table} SET ham_count = ham_count + ? WHERE id = ? AND token = ? @@ -2034,7 +2043,7 @@ my $atime_inserted_p = 0; my $new_tokens = 0; - my $insertsql = "INSERT INTO bayes_token + my $insertsql = "INSERT INTO $$self{_token_table} (id, token, spam_count, ham_count, atime) VALUES (?,?,?,?,?)"; @@ -2101,7 +2110,7 @@ my $sql; my @args; if ($update_atime_p) { - $sql = "UPDATE bayes_token + $sql = "UPDATE $$self{_token_table} SET spam_count = spam_count + ?, atime = ? WHERE id = ? @@ -2111,7 +2120,7 @@ $atime_updated_p = 1; } else { - $sql = "UPDATE bayes_token + $sql = "UPDATE $$self{_token_table} SET spam_count = spam_count + ? WHERE id = ? AND token = ? @@ -2131,7 +2140,7 @@ my @args; # if $spam_count then we already updated the atime if ($update_atime_p && !$spam_count) { - $sql = "UPDATE bayes_token + $sql = "UPDATE $$self{_token_table} SET ham_count = ham_count + ?, atime = ? WHERE id = ? @@ -2141,7 +2150,7 @@ $atime_updated_p = 1; } else { - $sql = "UPDATE bayes_token + $sql = "UPDATE $$self{_token_table} SET ham_count = ham_count + ? WHERE id = ? AND token = ? 
@@ -2219,7 +2228,7 @@ return 0 unless (defined($self->{_dbh})); - my $sql = "SELECT min(atime) FROM bayes_token + my $sql = "SELECT min(atime) FROM $$self{_token_table} WHERE id = ?"; my $sth = $self->{_dbh}->prepare_cached($sql); @@ -2260,7 +2269,7 @@ return 0 unless (defined($self->{_dbh})); my $sql = "SELECT count(*) - FROM bayes_token + FROM $$self{_token_table} WHERE id = ? AND spam_count + ham_count = 1"; @@ -2302,7 +2311,7 @@ return 0 unless (defined($self->{_dbh})); my $sql = "SELECT count(*) - FROM bayes_token + FROM $$self{_token_table} WHERE id = ? AND (spam_count >= 0 AND spam_count < 8) AND (ham_count >= 0 AND ham_count < 8) diff -ur Mail-SpamAssassin-3.2.4.orig/lib/Mail/SpamAssassin/Conf.pm Mail-SpamAssassin-3.2.4/lib/Mail/SpamAssassin/Conf.pm --- Mail-SpamAssassin-3.2.4.orig/lib/Mail/SpamAssassin/Conf.pm 2008-01-05 22:11:03.000000000 +0100 +++ Mail-SpamAssassin-3.2.4/lib/Mail/SpamAssassin/Conf.pm 2008-12-21 20:29:57.275157321 +0100 @@ -2683,6 +2683,27 @@ type => $CONF_TYPE_BOOL }); +=item bayes_sql_token_table_count + +Used by the BayesStore::SQL storage implementation. + +With this option you can spread the token information over several tables if +the single table will otherwise get too large. + +After setting this to a value > 1 you must create the tables +bayes_token_0 through bayes_token_[number - 1] yourself. They +have the same structure as the original bayes_token table. 
+ +=cut + + push (@cmds, { + setting => 'bayes_sql_token_table_count', + is_admin => 1, + default => 1, + type => $CONF_TYPE_NUMERIC + }); + + =item user_scores_dsn DBI:databasetype:databasename:hostname:port If you load user scores from an SQL database, this will set the DSN diff -ur Mail-SpamAssassin-3.2.4.orig/sql/README.bayes Mail-SpamAssassin-3.2.4/sql/README.bayes --- Mail-SpamAssassin-3.2.4.orig/sql/README.bayes 2008-01-05 22:12:32.000000000 +0100 +++ Mail-SpamAssassin-3.2.4/sql/README.bayes 2008-12-21 20:30:16.305444389 +0100 @@ -66,6 +66,15 @@ share bayesian filter data. You can also use this config option to trick sa-learn to learn data as a specific user. +If your token table gets really large you may spread the bayes +tokens over several tables. You can set this in the config file with + +bayes_sql_token_table_count number + +After setting this to a value > 1 you must create the tables +bayes_token_0 through bayes_token_[number - 1] yourself. They +have the same structure as the original bayes_token table. 
+ Requirements ------------ diff -ur Mail-SpamAssassin-3.2.4.orig/sql/bayes_mysql.sql Mail-SpamAssassin-3.2.4/sql/bayes_mysql.sql --- Mail-SpamAssassin-3.2.4.orig/sql/bayes_mysql.sql 2008-01-05 22:12:32.000000000 +0100 +++ Mail-SpamAssassin-3.2.4/sql/bayes_mysql.sql 2008-12-21 20:30:16.335444841 +0100 @@ -42,6 +42,7 @@ last_expire_reduce int(11) NOT NULL default '0', oldest_token_age int(11) NOT NULL default '2147483647', newest_token_age int(11) NOT NULL default '0', + token_table varchar(20) NOT NULL default 'bayes_token', PRIMARY KEY (id), UNIQUE bayes_vars_idx1 (username) ) TYPE=MyISAM; diff -ur Mail-SpamAssassin-3.2.4.orig/sql/bayes_pg.sql Mail-SpamAssassin-3.2.4/sql/bayes_pg.sql --- Mail-SpamAssassin-3.2.4.orig/sql/bayes_pg.sql 2008-01-05 22:12:32.000000000 +0100 +++ Mail-SpamAssassin-3.2.4/sql/bayes_pg.sql 2008-12-21 20:30:16.345444992 +0100 @@ -43,6 +43,7 @@ last_expire_reduce integer NOT NULL default '0', oldest_token_age integer NOT NULL default '2147483647', newest_token_age integer NOT NULL default '0', + token_table varchar(20) NOT NULL default 'bayes_token', PRIMARY KEY (id) ) WITHOUT OIDS; @@ -65,25 +66,33 @@ inspam_count INTEGER, inham_count INTEGER, inatime INTEGER) -RETURNS VOID AS ' +RETURNS VOID AS $$ DECLARE _token BYTEA; new_tokens INTEGER := 0; + tt VARCHAR(20); + rc INTEGER; BEGIN + SELECT token_table INTO tt FROM bayes_vars WHERE (id = inuserid); + for i in array_lower(intokenary, 1) .. 
array_upper(intokenary, 1) LOOP _token := intokenary[i]; - UPDATE bayes_token - SET spam_count = greatest_int(spam_count + inspam_count, 0), - ham_count = greatest_int(ham_count + inham_count, 0), - atime = greatest_int(atime, inatime) - WHERE id = inuserid - AND token = _token; + EXECUTE 'UPDATE ' || quote_ident(tt) || ' + SET spam_count = greatest_int(spam_count + ' || inspam_count || ', 0), + ham_count = greatest_int(ham_count + ' || inham_count || ', 0), + atime = greatest_int(atime, ' || inatime || ') + WHERE id = ' || inuserid || ' + AND token = ' || quote_literal(_token) || ';'; + -- EXECUTE does not set FOUND; read the affected row count instead + GET DIAGNOSTICS rc = ROW_COUNT; - IF NOT FOUND THEN + IF rc = 0 THEN -- we do not insert negative counts, just return true IF NOT (inspam_count < 0 OR inham_count < 0) THEN - INSERT INTO bayes_token (id, token, spam_count, ham_count, atime) - VALUES (inuserid, _token, inspam_count, inham_count, inatime); + EXECUTE 'INSERT INTO ' || quote_ident(tt) || ' (id, token, spam_count, ham_count, atime) + VALUES (' || inuserid || ', ' || quote_literal(_token) || ', ' + || inspam_count || ', ' || inham_count || ', ' || inatime || ');'; + GET DIAGNOSTICS rc = ROW_COUNT; - IF FOUND THEN + IF rc > 0 THEN new_tokens := new_tokens + 1; END IF; @@ -109,4 +118,4 @@ END IF; RETURN; END; -' LANGUAGE 'plpgsql'; +$$ LANGUAGE 'plpgsql';