Index: lib/Mail/SpamAssassin.pm =================================================================== --- lib/Mail/SpamAssassin.pm (.../trunk) (revision 602913) +++ lib/Mail/SpamAssassin.pm (.../branches/bug-5293-pluginized-bayes) (revision 602913) @@ -671,7 +671,7 @@ sub finish_learner { my $self = shift; - $self->{bayes_scanner}->sanity_check_is_untied(1) if $self->{bayes_scanner}; + $self->{bayes_scanner}->force_close(1) if $self->{bayes_scanner}; 1; } @@ -1325,7 +1325,7 @@ } # make sure things are ready for scanning - $self->{bayes_scanner}->sanity_check_is_untied() if $self->{bayes_scanner}; + $self->{bayes_scanner}->force_close() if $self->{bayes_scanner}; $self->call_plugins("compile_now_finish", { use_user_prefs => $use_user_prefs, keep_userstate => $deal_with_userstate}); Index: lib/Mail/SpamAssassin/BayesStore/SQL.pm =================================================================== --- lib/Mail/SpamAssassin/BayesStore/SQL.pm (.../trunk) (revision 602913) +++ lib/Mail/SpamAssassin/BayesStore/SQL.pm (.../branches/bug-5293-pluginized-bayes) (revision 602913) @@ -48,7 +48,7 @@ =head2 new -public class (Mail::SpamAssassin::BayesStore::SQL) new (Mail::Spamassassin::Bayes $bayes) +public class (Mail::SpamAssassin::BayesStore::SQL) new (Mail::Spamassassin::Plugin::Bayes $bayes) Description: This methods creates a new instance of the Mail::SpamAssassin::BayesStore::SQL @@ -648,7 +648,7 @@ } while (my ($token, $spam_count, $ham_count, $atime) = $sth->fetchrow_array()) { - my $prob = $self->{bayes}->compute_prob_for_token($token, $vars[1], $vars[2], + my $prob = $self->{bayes}->_compute_prob_for_token($token, $vars[1], $vars[2], $spam_count, $ham_count); $prob ||= 0.5; Index: lib/Mail/SpamAssassin/BayesStore/DBM.pm =================================================================== --- lib/Mail/SpamAssassin/BayesStore/DBM.pm (.../trunk) (revision 602913) +++ lib/Mail/SpamAssassin/BayesStore/DBM.pm (.../branches/bug-5293-pluginized-bayes) (revision 602913) @@ -980,7 +980,7 @@ # We have the value already, so just unpack it. my ($ts, $th, $atime) = $self->tok_unpack ($tokvalue); - my $prob = $self->{bayes}->compute_prob_for_token($tok, $vars[1], $vars[2], $ts, $th); + my $prob = $self->{bayes}->_compute_prob_for_token($tok, $vars[1], $vars[2], $ts, $th); $prob ||= 0.5; my $encoded_tok = unpack("H*",$tok); Index: lib/Mail/SpamAssassin/Plugin.pm =================================================================== --- lib/Mail/SpamAssassin/Plugin.pm (.../trunk) (revision 602913) +++ lib/Mail/SpamAssassin/Plugin.pm (.../branches/bug-5293-pluginized-bayes) (revision 602913) @@ -837,6 +837,106 @@ %{$self} = (); } +=item $plugin->learner_new () + +Used to support human-trained probabilistic classifiers like the BAYES_* ruleset. +Called when a new C object has been created; typically +when a new user's scan is about to start. + +=item $plugin->learn_message () + +Train the classifier with a training message. + +=over 4 + +=item isspam + +1 if the message is spam, 0 if it's non-spam. + +=item msg + +The message's C object. + +=item id + +An optional message-identification string, used internally to tag the message. +If it is C, one will be generated. It should be unique to that message. + +=back + +=item $plugin->forget_message () + +Tell the classifier to 'forget' its training about a specific message. + +=over 4 + +=item msg + +The message's C object. + +=item id + +An optional message-identification string, used internally to tag the message. +If it is C, one will be generated. It should be unique to that message. + +=back + +=item $plugin->learner_sync () + +Tell the classifier to 'sync' any pending changes against the current +user's training database. This is called by C. + +If you do not need to implement these for your classifier, create an +implementation that just contains C. + +=item $plugin->learner_expire_old_training () + +Tell the classifier to perform infrequent, time-consuming cleanup of +the current user's training database. This is called by C. + +If you do not need to implement these for your classifier, create an +implementation that just contains C. + +=item $plugin->learner_is_scan_available () + +Should return 1 if it is possible to use the current user's training data for +a message-scan operation, or 0 otherwise. + +=item $plugin->learner_dump_database () + +Dump information about the current user's training data to C. +This is called by C. + +=over 4 + +=item magic + +Set to 1 if "magic" name-value metadata should be dumped. + +=item toks + +Set to 1 if the database of tokens should be dumped. + +=item regex + +Either C to dump all tokens, or a value which specifies a regular expression +subset of the tokens to dump. + +=back + +=item $plugin->learner_close () + +Close any open databases. + +=over 4 + +=item quiet + +Set to 1 if warning messages should be suppressed. + +=back + =head1 HELPER APIS These methods provide an API for plugins to register themselves Index: lib/Mail/SpamAssassin/PerMsgStatus.pm =================================================================== --- lib/Mail/SpamAssassin/PerMsgStatus.pm (.../trunk) (revision 602913) +++ lib/Mail/SpamAssassin/PerMsgStatus.pm (.../branches/bug-5293-pluginized-bayes) (revision 602913) @@ -244,7 +244,7 @@ $self->{main}->finish_learner(); # for now if (exists $self->{main}->{bayes_scanner}) { - $self->{main}->{bayes_scanner}->sanity_check_is_untied(); + $self->{main}->{bayes_scanner}->force_close(); } 1; } or do { @@ -988,52 +988,6 @@ return $text; } -sub bayes_report_make_list { - my $self = shift; - my $info = shift; - my $param = shift || "5"; - my ($limit,$fmt_arg,$more) = split /,/, $param; - - return "Tokens not available." unless defined $info; - - my %formats = ( - short => '$t', - Short => 'Token: \"$t\"', - compact => '$p-$D--$t', - Compact => 'Probability $p -declassification distance $D (\"+\" means > 9) --token: \"$t\"', - medium => '$p-$D-$N--$t', - long => '$p-$d--${h}h-${s}s--${a}d--$t', - Long => 'Probability $p -declassification distance $D --in ${h} ham messages -and ${s} spam messages --$a} days old--token:\"$t\"' - ); - - my $raw_fmt = (!$fmt_arg ? '$p-$D--$t' : $formats{$fmt_arg}); - - return "Invalid format, must be one of: ".join(",",keys %formats) - unless defined $raw_fmt; - - my $fmt = '"'.$raw_fmt.'"'; - my $amt = $limit < @$info ? $limit : @$info; - return "" unless $amt; - - my $Bayes = $self->{main}{bayes_scanner}; - return "Bayes not available" unless defined $Bayes; - my $ns = $self->{bayes_nspam}; - my $nh = $self->{bayes_nham}; - my $digit = sub { $_[0] > 9 ? "+" : $_[0] }; - my $now = time; - - join ', ', map { - my($t,$prob,$s,$h,$u) = @$_; - my $a = int(($now - $u)/(3600 * 24)); - my $d = $Bayes->compute_declassification_distance($ns,$nh,$s,$h,$prob); - my $p = sprintf "%.3f", $prob; - my $n = $s + $h; - my ($c,$o) = $prob < 0.5 ? ($h,$s) : ($s,$h); - my ($D,$S,$H,$C,$O,$N) = map &$digit($_), ($d,$s,$h,$c,$o,$n); - eval $fmt; ## no critic - } @{$info}[0..$amt-1]; -} - ########################################################################### # public API for plugins @@ -1216,33 +1170,6 @@ sprintf("%3.4f", $self->{bayes_score}) : "0.5" }, - HAMMYTOKENS => sub { - $self->bayes_report_make_list - ( $self->{bayes_token_info_hammy}, shift ); - }, - - SPAMMYTOKENS => sub { - $self->bayes_report_make_list - ( $self->{bayes_token_info_spammy}, shift ); - }, - - TOKENSUMMARY => sub { - if( defined $self->{tag_data}{BAYESTC} ) - { - my $tcount_neutral = $self->{tag_data}{BAYESTCLEARNED} - - $self->{tag_data}{BAYESTCSPAMMY} - - $self->{tag_data}{BAYESTCHAMMY}; - my $tcount_new = $self->{tag_data}{BAYESTC} - - $self->{tag_data}{BAYESTCLEARNED}; - "Tokens: new, $tcount_new; " - ."hammy, $self->{tag_data}{BAYESTCHAMMY}; " - ."neutral, $tcount_neutral; " - ."spammy, $self->{tag_data}{BAYESTCSPAMMY}." - } else { - "Bayes not run."; - } - }, - DATE => \&Mail::SpamAssassin::Util::time_to_rfc822_date, STARS => sub { Index: lib/Mail/SpamAssassin/PerMsgLearner.pm =================================================================== --- lib/Mail/SpamAssassin/PerMsgLearner.pm (.../trunk) (revision 602913) +++ lib/Mail/SpamAssassin/PerMsgLearner.pm (.../branches/bug-5293-pluginized-bayes) (revision 602913) @@ -90,8 +90,8 @@ # Learn the message as spam. # # C<$id> is an optional message-identification string, used internally -# to tag the message. If it is C, the Message-Id of the message -# will be used. It should be unique to that message. +# to tag the message. If it is C, one will be generated. +# It should be unique to that message. # # This is a semi-private API; callers should use # C<$spamtest-Elearn($mail,$id,$isspam,$forget)> instead. @@ -117,8 +117,8 @@ # Learn the message as ham. # # C<$id> is an optional message-identification string, used internally -# to tag the message. If it is C, the Message-Id of the message -# will be used. It should be unique to that message. +# to tag the message. If it is C, one will be generated. +# It should be unique to that message. # # This is a semi-private API; callers should use # C<$spamtest-Elearn($mail,$id,$isspam,$forget)> instead. @@ -141,8 +141,8 @@ # Forget about a previously-learned message. # # C<$id> is an optional message-identification string, used internally -# to tag the message. If it is C, the Message-Id of the message -# will be used. It should be unique to that message. +# to tag the message. If it is C, one will be generated. +# It should be unique to that message. # # This is a semi-private API; callers should use # C<$spamtest-Elearn($mail,$id,$isspam,$forget)> instead. Index: lib/Mail/SpamAssassin/BayesStore.pm =================================================================== --- lib/Mail/SpamAssassin/BayesStore.pm (.../trunk) (revision 602913) +++ lib/Mail/SpamAssassin/BayesStore.pm (.../branches/bug-5293-pluginized-bayes) (revision 602913) @@ -17,12 +17,12 @@ =head1 NAME -Mail::SpamAssassin::BayesStore - Bayesian Storage Module +Mail::SpamAssassin::BayesStore - Storage Module for default Bayes classifier =head1 DESCRIPTION This is the public API for the Bayesian store methods. Any implementation of -the storage module must implement these methods. +the storage module for the default Bayes classifier must implement these methods. =cut @@ -43,12 +43,12 @@ =item new -public class (Mail::SpamAssassin::BayesStore) new (Mail::SpamAssassin::Bayes $bayes) +public class (Mail::SpamAssassin::BayesStore) new (Mail::SpamAssassin::Plugin::Bayes $bayes) Description: This method creates a new instance of the Mail::SpamAssassin::BayesStore -object. You must pass in an instance of the Mail::SpamAssassin:Bayes object, -which is stashed for use throughout the module. +object. You must pass in an instance of the Mail::SpamAssassin::Plugin::Bayes +object, which is stashed for use throughout the module. =cut @@ -89,7 +89,7 @@ Description: This method reads any needed config variables from the configuration -object and then calls the Mail::SpamAssassin::Bayes read_db_configs method. +object and then calls the Mail::SpamAssassin::Plugin::Plugin::Bayes read_db_configs method. =cut Index: lib/Mail/SpamAssassin/Plugin/WLBLEval.pm =================================================================== --- lib/Mail/SpamAssassin/Plugin/WLBLEval.pm (.../trunk) (revision 602913) +++ lib/Mail/SpamAssassin/Plugin/WLBLEval.pm (.../branches/bug-5293-pluginized-bayes) (revision 602913) @@ -121,6 +121,8 @@ return 0; } +# TODO: this should be moved to a utility module off PerMsgStatus, +# rather than a plugin API; it's used in Bayes.pm as a utility sub check_wb_list { my ($self, $params) = @_; Index: lib/Mail/SpamAssassin/Plugin/Bayes.pm =================================================================== --- lib/Mail/SpamAssassin/Plugin/Bayes.pm (.../trunk) (revision 602913) +++ lib/Mail/SpamAssassin/Plugin/Bayes.pm (.../branches/bug-5293-pluginized-bayes) (revision 602913) @@ -15,42 +15,256 @@ # limitations under the License. # +=head1 NAME + +Mail::SpamAssassin::Plugin::Bayes - determine spammishness using a Bayesian classifier + +=head1 DESCRIPTION + +This is a Bayesian-style probabilistic classifier, using an algorithm based on +the one detailed in Paul Graham's I paper at: + + http://www.paulgraham.com/spam.html + +It also incorporates some other aspects taken from Graham Robinson's webpage +on the subject at: + + http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html + +And the chi-square probability combiner as described here: + + http://www.linuxjournal.com/print.php?sid=6467 + +The results are incorporated into SpamAssassin as the BAYES_* rules. + +=head1 METHODS + +=over 4 + +=cut + package Mail::SpamAssassin::Plugin::Bayes; -use Mail::SpamAssassin::Plugin; use strict; use warnings; use bytes; use re 'taint'; -use vars qw(@ISA); -@ISA = qw(Mail::SpamAssassin::Plugin); +use Mail::SpamAssassin; +use Mail::SpamAssassin::Plugin; +use Mail::SpamAssassin::PerMsgStatus; +use Mail::SpamAssassin::Logger; +use Mail::SpamAssassin::Util qw(untaint_var); -# constructor: register the eval rule +# pick ONLY ONE of these combining implementations. +use Mail::SpamAssassin::Bayes::CombineChi; +# use Mail::SpamAssassin::Bayes::CombineNaiveBayes; + +use Digest::SHA1 qw(sha1 sha1_hex); + +our @ISA = qw(Mail::SpamAssassin::Plugin); + +use vars qw{ + $IGNORED_HDRS + $MARK_PRESENCE_ONLY_HDRS + %HEADER_NAME_COMPRESSION + $OPPORTUNISTIC_LOCK_VALID +}; + +# Which headers should we scan for tokens? Don't use all of them, as it's easy +# to pick up spurious clues from some. What we now do is use all of them +# *less* these well-known headers; that way we can pick up spammers' tracking +# headers (which are obviously not well-known in advance!). + +# Received is handled specially +$IGNORED_HDRS = qr{(?: (?:X-)?Sender # misc noise + |Delivered-To |Delivery-Date + |(?:X-)?Envelope-To + |X-MIME-Auto[Cc]onverted |X-Converted-To-Plain-Text + + |Subject # not worth a tiny gain vs. to db size increase + + # Date: can provide invalid cues if your spam corpus is + # older/newer than ham + |Date + + # List headers: ignore. a spamfiltering mailing list will + # become a nonspam sign. + |X-List|(?:X-)?Mailing-List + |(?:X-)?List-(?:Archive|Help|Id|Owner|Post|Subscribe + |Unsubscribe|Host|Id|Manager|Admin|Comment + |Name|Url) + |X-Unsub(?:scribe)? + |X-Mailman-Version |X-Been[Tt]here |X-Loop + |Mail-Followup-To + |X-eGroups-(?:Return|From) + |X-MDMailing-List + |X-XEmacs-List + + # gatewayed through mailing list (thanks to Allen Smith) + |(?:X-)?Resent-(?:From|To|Date) + |(?:X-)?Original-(?:From|To|Date) + + # Spamfilter/virus-scanner headers: too easy to chain from + # these + |X-MailScanner(?:-SpamCheck)? + |X-Spam(?:-(?:Status|Level|Flag|Report|Hits|Score|Checker-Version))? + |X-Antispam |X-RBL-Warning |X-Mailscanner + |X-MDaemon-Deliver-To |X-Virus-Scanned + |X-Mass-Check-Id + |X-Pyzor |X-DCC-\S{2,25}-Metrics + |X-Filtered-B[Yy] |X-Scanned-By |X-Scanner + |X-AP-Spam-(?:Score|Status) |X-RIPE-Spam-Status + |X-SpamCop-[^:]+ + |X-SMTPD |(?:X-)?Spam-Apparently-To + |SPAM |X-Perlmx-Spam + |X-Bogosity + + # some noisy Outlook headers that add no good clues: + |Content-Class |Thread-(?:Index|Topic) + |X-Original[Aa]rrival[Tt]ime + + # Annotations from IMAP, POP, and MH: + |(?:X-)?Status |X-Flags |Replied |Forwarded + |Lines |Content-Length + |X-UIDL? |X-IMAPbase + + # Annotations from Bugzilla + |X-Bugzilla-[^:]+ + + # Annotations from VM: (thanks to Allen Smith) + |X-VM-(?:Bookmark|(?:POP|IMAP)-Retrieved|Labels|Last-Modified + |Summary-Format|VHeader|v\d-Data|Message-Order) + + # Annotations from Gnus: + | X-Gnus-Mail-Source + | Xref + +)}x; + +# Note only the presence of these headers, in order to reduce the +# hapaxen they generate. +$MARK_PRESENCE_ONLY_HDRS = qr{(?: X-Face + |X-(?:Gnu-?PG|PGP|GPG)(?:-Key)?-Fingerprint +)}ix; + +# tweaks tested as of Nov 18 2002 by jm: see SpamAssassin-devel list archives +# for results. The winners are now the default settings. +use constant IGNORE_TITLE_CASE => 1; +use constant TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES => 1; +use constant TOKENIZE_LONG_TOKENS_AS_SKIPS => 1; + +# tweaks of May 12 2003, see SpamAssassin-devel archives again. +use constant PRE_CHEW_ADDR_HEADERS => 1; +use constant CHEW_BODY_URIS => 1; +use constant CHEW_BODY_MAILADDRS => 1; +use constant HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1; +use constant BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1; +use constant URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 0; +use constant IGNORE_MSGID_TOKENS => 0; + +# tweaks of 12 March 2004, see bug 2129. +use constant DECOMPOSE_BODY_TOKENS => 1; +use constant MAP_HEADERS_MID => 1; +use constant MAP_HEADERS_FROMTOCC => 1; +use constant MAP_HEADERS_USERAGENT => 1; + +# tweaks, see http://issues.apache.org/SpamAssassin/show_bug.cgi?id=3173#c26 +use constant ADD_INVIZ_TOKENS_I_PREFIX => 1; +use constant ADD_INVIZ_TOKENS_NO_PREFIX => 0; + +# We store header-mined tokens in the db with a "HHeaderName:val" format. +# some headers may contain lots of gibberish tokens, so allow a little basic +# compression by mapping the header name at least here. these are the headers +# which appear with the most frequency in my db. note: this doesn't have to +# be 2-way (ie. LHSes that map to the same RHS are not a problem), but mixing +# tokens from multiple different headers may impact accuracy, so might as well +# avoid this if possible. These are the top ones from my corpus, BTW (jm). +%HEADER_NAME_COMPRESSION = ( + 'Message-Id' => '*m', + 'Message-ID' => '*M', + 'Received' => '*r', + 'User-Agent' => '*u', + 'References' => '*f', + 'In-Reply-To' => '*i', + 'From' => '*F', + 'Reply-To' => '*R', + 'Return-Path' => '*p', + 'Return-path' => '*rp', + 'X-Mailer' => '*x', + 'X-Authentication-Warning' => '*a', + 'Organization' => '*o', + 'Organisation' => '*o', + 'Content-Type' => '*c', + 'X-Spam-Relays-Trusted' => '*RT', + 'X-Spam-Relays-Untrusted' => '*RU', +); + +# How many seconds should the opportunistic_expire lock be valid? +$OPPORTUNISTIC_LOCK_VALID = 300; + +# Should we use the Robinson f(w) equation from +# http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html ? +# It gives better results, in that scores are more likely to distribute +# into the <0.5 range for nonspam and >0.5 for spam. +use constant USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS => 1; + +# How many of the most significant tokens should we use for the p(w) +# calculation? +use constant N_SIGNIFICANT_TOKENS => 150; + +# How many significant tokens are required for a classifier score to +# be considered usable? +use constant REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE => -1; + +# How long a token should we hold onto? (note: German speakers typically +# will require a longer token than English ones.) +use constant MAX_TOKEN_LENGTH => 15; + +########################################################################### + sub new { my $class = shift; - my $mailsaobject = shift; + my ($main) = @_; - # some boilerplate... $class = ref($class) || $class; - my $self = $class->SUPER::new($mailsaobject); + my $self = $class->SUPER::new($main); bless ($self, $class); - # the important bit! + $self->{main} = $main; + $self->{conf} = $main->{conf}; + $self->{use_ignores} = 1; + $self->register_eval_rule("check_bayes"); + $self; +} - return $self; +sub finish { + my $self = shift; + if ($self->{store}) { + $self->{store}->untie_db(); + } + %{$self} = (); } +# Plugin hook. +# Return this implementation object, for callers that need to know +# it. TODO: callers shouldn't *need* to know it! +# used only in test suite to get access to {store}, internal APIs. +# +sub learner_get_implementation { return shift; } + +########################################################################### + sub check_bayes { my ($self, $pms, $fulltext, $min, $max) = @_; + return 0 if (!$pms->{conf}->{use_learner}); return 0 if (!$pms->{conf}->{use_bayes} || !$pms->{conf}->{use_bayes_rules}); if (!exists ($pms->{bayes_score})) { my $timer = $self->{main}->time_method("check_bayes"); - $pms->{bayes_score} = - $self->{main}->{bayes_scanner}->scan ($pms, $pms->{msg}); + $pms->{bayes_score} = $self->scan($pms, $pms->{msg}); } if (defined $pms->{bayes_score} && @@ -71,4 +285,1264 @@ return 0; } +########################################################################### + +# Plugin hook. +sub learner_close { + my ($self, $params) = @_; + my $quiet = $params->{quiet}; + + # do a sanity check here. Wierd things happen if we remain tied + # after compiling; for example, spamd will never see that the + # number of messages has reached the bayes-scanning threshold. + if ($self->{store}->db_readable()) { + warn "bayes: oops! still tied to bayes DBs, untying\n" unless $quiet; + $self->{store}->untie_db(); + } +} + +########################################################################### + +# read configuration items to control bayes behaviour. Called by +# BayesStore::read_db_configs(). +sub read_db_configs { + my ($self) = @_; + + # use of hapaxes. Set on bayes object, since it controls prob + # computation. + $self->{use_hapaxes} = $self->{conf}->{bayes_use_hapaxes}; +} +########################################################################### + +sub ignore_message { + my ($self,$PMS) = @_; + + return 0 unless $self->{use_ignores}; + + my $ig_from = $self->{main}->call_plugins ("check_wb_list", + { permsgstatus => $PMS, type => 'from', list => 'bayes_ignore_from' }); + my $ig_to = $self->{main}->call_plugins ("check_wb_list", + { permsgstatus => $PMS, type => 'to', list => 'bayes_ignore_to' }); + + my $ignore = $ig_from || $ig_to; + + dbg("bayes: not using bayes, bayes_ignore_from or _to rule") if $ignore; + + return $ignore; +} + +########################################################################### + +# Plugin hook. +sub learn_message { + my ($self, $params) = @_; + my $isspam = $params->{isspam}; + my $msg = $params->{msg}; + my $id = $params->{id}; + + if (!$self->{conf}->{use_bayes}) { return; } + + my $msgdata = $self->get_body_from_msg ($msg); + my $ret; + + eval { + local $SIG{'__DIE__'}; # do not run user die() traps in here + + my $ok; + if ($self->{main}->{learn_to_journal}) { + # If we're going to learn to journal, we'll try going r/o first... + # If that fails for some reason, let's try going r/w. This happens + # if the DB doesn't exist yet. + $ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable(); + } else { + $ok = $self->{store}->tie_db_writable(); + } + + if ($ok) { + $ret = $self->_learn_trapped ($isspam, $msg, $msgdata, $id); + + if (!$self->{main}->{learn_caller_will_untie}) { + $self->{store}->untie_db(); + } + } + 1; + } or do { # if we died, untie the dbs. + my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; + $self->{store}->untie_db(); + die "bayes: (in learn) $eval_stat\n"; + }; + + return $ret; +} + +# this function is trapped by the wrapper above +sub _learn_trapped { + my ($self, $isspam, $msg, $msgdata, $msgid) = @_; + my @msgid = ( $msgid ); + + if (!defined $msgid) { + @msgid = $self->get_msgid($msg); + } + + foreach $msgid ( @msgid ) { + my $seen = $self->{store}->seen_get ($msgid); + + if (defined ($seen)) { + if (($seen eq 's' && $isspam) || ($seen eq 'h' && !$isspam)) { + dbg("bayes: $msgid already learnt correctly, not learning twice"); + return 0; + } elsif ($seen !~ /^[hs]$/) { + warn("bayes: db_seen corrupt: value='$seen' for $msgid, ignored"); + } else { + # bug 3704: If the message was already learned, don't try learning it again. + # this prevents, for instance, manually learning as spam, then autolearning + # as ham, or visa versa. + if ($self->{main}->{learn_no_relearn}) { + dbg("bayes: $msgid already learnt as opposite, not re-learning"); + return 0; + } + + dbg("bayes: $msgid already learnt as opposite, forgetting first"); + + # kluge so that forget() won't untie the db on us ... + my $orig = $self->{main}->{learn_caller_will_untie}; + $self->{main}->{learn_caller_will_untie} = 1; + + my $fatal = !defined $self->{main}->{bayes_scanner}->forget ($msg); + + # reset the value post-forget() ... + $self->{main}->{learn_caller_will_untie} = $orig; + + # forget() gave us a fatal error, so propagate that up + if ($fatal) { + dbg("bayes: forget() returned a fatal error, so learn() will too"); + return; + } + } + + # we're only going to have seen this once, so stop if it's been + # seen already + last; + } + } + + # Now that we're sure we haven't seen this message before ... + $msgid = $msgid[0]; + + if ($isspam) { + $self->{store}->nspam_nham_change (1, 0); + } else { + $self->{store}->nspam_nham_change (0, 1); + } + + my $msgatime = $msg->receive_date(); + + # If the message atime comes back as being more than 1 day in the + # future, something's messed up and we should revert to current time as + # a safety measure. + # + $msgatime = time if ( $msgatime - time > 86400 ); + + my $tokens = $self->tokenize($msg, $msgdata); + + if ($isspam) { + $self->{store}->multi_tok_count_change(1, 0, $tokens, $msgatime); + } else { + $self->{store}->multi_tok_count_change(0, 1, $tokens, $msgatime); + } + + $self->{store}->seen_put ($msgid, ($isspam ? 's' : 'h')); + $self->{store}->cleanup(); + + $self->{main}->call_plugins("bayes_learn", { toksref => $tokens, + isspam => $isspam, + msgid => $msgid, + msgatime => $msgatime, + }); + + dbg("bayes: learned '$msgid', atime: $msgatime"); + + 1; +} + +########################################################################### + +# Plugin hook. +sub forget_message { + my ($self, $params) = @_; + my $msg = $params->{msg}; + my $id = $params->{id}; + + if (!$self->{conf}->{use_bayes}) { return; } + + my $msgdata = $self->get_body_from_msg ($msg); + my $ret; + + # we still tie for writing here, since we write to the seen db + # synchronously + eval { + local $SIG{'__DIE__'}; # do not run user die() traps in here + + my $ok; + if ($self->{main}->{learn_to_journal}) { + # If we're going to learn to journal, we'll try going r/o first... + # If that fails for some reason, let's try going r/w. This happens + # if the DB doesn't exist yet. + $ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable(); + } else { + $ok = $self->{store}->tie_db_writable(); + } + + if ($ok) { + $ret = $self->_forget_trapped ($msg, $msgdata, $id); + + if (!$self->{main}->{learn_caller_will_untie}) { + $self->{store}->untie_db(); + } + } + 1; + } or do { # if we died, untie the dbs. + my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; + $self->{store}->untie_db(); + die "bayes: (in forget) $eval_stat\n"; + }; + + return $ret; +} + +# this function is trapped by the wrapper above +sub _forget_trapped { + my ($self, $msg, $msgdata, $msgid) = @_; + my @msgid = ( $msgid ); + my $isspam; + + if (!defined $msgid) { + @msgid = $self->get_msgid($msg); + } + + while( $msgid = shift @msgid ) { + my $seen = $self->{store}->seen_get ($msgid); + + if (defined ($seen)) { + if ($seen eq 's') { + $isspam = 1; + } elsif ($seen eq 'h') { + $isspam = 0; + } else { + dbg("bayes: forget: msgid $msgid seen entry is neither ham nor spam, ignored"); + return 0; + } + + # messages should only be learned once, so stop if we find a msgid + # which was seen before + last; + } + else { + dbg("bayes: forget: msgid $msgid not learnt, ignored"); + } + } + + # This message wasn't learnt before, so return + if (!defined $isspam) { + dbg("bayes: forget: no msgid from this message has been learnt, skipping message"); + return 0; + } + elsif ($isspam) { + $self->{store}->nspam_nham_change (-1, 0); + } + else { + $self->{store}->nspam_nham_change (0, -1); + } + + my $tokens = $self->tokenize($msg, $msgdata); + + if ($isspam) { + $self->{store}->multi_tok_count_change (-1, 0, $tokens); + } else { + $self->{store}->multi_tok_count_change (0, -1, $tokens); + } + + $self->{store}->seen_delete ($msgid); + $self->{store}->cleanup(); + + $self->{main}->call_plugins("bayes_forget", { toksref => $tokens, + isspam => $isspam, + msgid => $msgid, + }); + + 1; +} + +########################################################################### + +# Plugin hook. +sub learner_sync { + my ($self, $params) = @_; + if (!$self->{conf}->{use_bayes}) { return 0; } + dbg("bayes: bayes journal sync starting"); + $self->{store}->sync($params); + dbg("bayes: bayes journal sync completed"); +} + +########################################################################### + +# Plugin hook. +sub learner_expire_old_training { + my ($self, $params) = @_; + if (!$self->{conf}->{use_bayes}) { return 0; } + dbg("bayes: expiry starting"); + $self->{store}->expire_old_tokens($params); + dbg("bayes: expiry completed"); +} + +########################################################################### + +# Plugin hook. +# Check to make sure we can tie() the DB, and we have enough entries to do a scan +# if we're told the caller will untie(), go ahead and leave the db tied. +sub learner_is_scan_available { + my ($self, $params) = @_; + + return 0 unless $self->{conf}->{use_bayes}; + return 0 unless $self->{store}->tie_db_readonly(); + + # We need the DB to stay tied, so if the journal sync occurs, don't untie! + my $caller_untie = $self->{main}->{learn_caller_will_untie}; + $self->{main}->{learn_caller_will_untie} = 1; + + # Do a journal sync if necessary. Do this before the nspam_nham_get() + # call since the sync may cause an update in the number of messages + # learnt. + $self->_opportunistic_calls(1); + + # Reset the variable appropriately + $self->{main}->{learn_caller_will_untie} = $caller_untie; + + my ($ns, $nn) = $self->{store}->nspam_nham_get(); + + if ($ns < $self->{conf}->{bayes_min_spam_num}) { + dbg("bayes: not available for scanning, only $ns spam(s) in bayes DB < ".$self->{conf}->{bayes_min_spam_num}); + if (!$self->{main}->{learn_caller_will_untie}) { + $self->{store}->untie_db(); + } + return 0; + } + if ($nn < $self->{conf}->{bayes_min_ham_num}) { + dbg("bayes: not available for scanning, only $nn ham(s) in bayes DB < ".$self->{conf}->{bayes_min_ham_num}); + if (!$self->{main}->{learn_caller_will_untie}) { + $self->{store}->untie_db(); + } + return 0; + } + + return 1; +} + +########################################################################### + +sub scan { + my ($self, $permsgstatus, $msg) = @_; + my $score; + + return unless $self->{conf}->{use_learner}; + + # When we're doing a scan, we'll guarantee that we'll do the untie, + # so override the global setting until we're done. + my $caller_untie = $self->{main}->{learn_caller_will_untie}; + $self->{main}->{learn_caller_will_untie} = 1; + + goto skip if ($self->{main}->{bayes_scanner}->ignore_message($permsgstatus)); + + goto skip unless $self->learner_is_scan_available(); + + my ($ns, $nn) = $self->{store}->nspam_nham_get(); + + ## if ($self->{log_raw_counts}) { # see _compute_prob_for_token() + ## $self->{raw_counts} = " ns=$ns nn=$nn "; + ## } + + dbg("bayes: corpus size: nspam = $ns, nham = $nn"); + + my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus); + + my $msgtokens = $self->tokenize($msg, $msgdata); + + my $tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens}); + + my %pw; + + foreach my $tokendata (@{$tokensdata}) { + my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata}; + my $prob = $self->_compute_prob_for_token($token, $ns, $nn, $tok_spam, $tok_ham); + next unless defined $prob; + + $pw{$token} = { + prob => $prob, + spam_count => $tok_spam, + ham_count => $tok_ham, + atime => $atime + }; + } + + # If none of the tokens were found in the DB, we're going to skip + # this message... + if (!keys %pw) { + dbg("bayes: cannot use bayes on this message; none of the tokens were found in the database"); + goto skip; + } + + my $tcount_total = keys %{$msgtokens}; + my $tcount_learned = keys %pw; + + # Figure out the message receive time (used as atime below) + # If the message atime comes back as being in the future, something's + # messed up and we should revert to current time as a safety measure. + # + my $msgatime = $msg->receive_date(); + my $now = time; + $msgatime = $now if ( $msgatime > $now ); + + # now take the $count most significant tokens and calculate probs using + # Robinson's formula. + my $count = N_SIGNIFICANT_TOKENS; + my @sorted; + + my @touch_tokens; + my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = []; + my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = []; + + my %tok_strength = map { $_ => (abs($pw{$_}->{prob} - 0.5)) } keys %pw; + my $log_each_token = (would_log('dbg', 'bayes') > 1); + + foreach my $tok (sort { + $tok_strength{$b} <=> $tok_strength{$a} + } keys %pw) + { + if ($count-- < 0) { last; } + next if ($tok_strength{$tok} < + $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH); + + my $pw = $pw{$tok}->{prob}; + + # What's more expensive, scanning headers for HAMMYTOKENS and + # SPAMMYTOKENS tags that aren't there or collecting data that + # won't be used? Just collecting the data is certainly simpler. + # + my $raw_token = $msgtokens->{$tok} || "(unknown)"; + my $s = $pw{$tok}->{spam_count}; + my $n = $pw{$tok}->{ham_count}; + my $a = $pw{$tok}->{atime}; + + if ($pw < 0.5) { + push @$tinfo_hammy, [$raw_token,$pw,$s,$n,$a]; + } else { + push @$tinfo_spammy, [$raw_token,$pw,$s,$n,$a]; + } + + push (@sorted, $pw); + + # update the atime on this token, it proved useful + push(@touch_tokens, $tok); + + if ($log_each_token) { + dbg("bayes: token '$raw_token' => $pw"); + } + } + + if (!@sorted || (REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE > 0 && + $#sorted <= REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE)) + { + dbg("bayes: cannot use bayes on this message; not enough usable tokens found"); + goto skip; + } + + $score = Mail::SpamAssassin::Bayes::Combine::combine($ns, $nn, \@sorted); + + # Couldn't come up with a probability? + goto skip unless defined $score; + + dbg("bayes: score = $score"); + + # no need to call tok_touch_all unless there were significant + # tokens and a score was returned + # we don't really care about the return value here + $self->{store}->tok_touch_all(\@touch_tokens, $msgatime); + + $permsgstatus->{bayes_nspam} = $ns; + $permsgstatus->{bayes_nham} = $nn; + + ## if ($self->{log_raw_counts}) { # see _compute_prob_for_token() + ## print "#Bayes-Raw-Counts: $self->{raw_counts}\n"; + ## } + + $self->{main}->call_plugins("bayes_scan", { toksref => $msgtokens, + probsref => \%pw, + score => $score, + msgatime => $msgatime, + significant_tokens => \@touch_tokens, + }); + +skip: + if (!defined $score) { + dbg("bayes: not scoring message, returning undef"); + } + + # Take any opportunistic actions we can take + if ($self->{main}->{opportunistic_expire_check_only}) { + # we're supposed to report on expiry only -- so do the + # _opportunistic_calls() run for the journal only. + $self->_opportunistic_calls(1); + $permsgstatus->{bayes_expiry_due} = $self->{store}->expiry_due(); + } + else { + $self->_opportunistic_calls(); + } + + # Do any cleanup we need to do + $self->{store}->cleanup(); + + # Reset the value accordingly + $self->{main}->{learn_caller_will_untie} = $caller_untie; + + # If our caller won't untie the db, we need to do it. + if (!$caller_untie) { + $self->{store}->untie_db(); + } + + $permsgstatus->set_tag ('BAYESTCHAMMY', + ($tinfo_hammy ? scalar @{$tinfo_hammy} : 0)); + $permsgstatus->set_tag ('BAYESTCSPAMMY', + ($tinfo_spammy ? scalar @{$tinfo_spammy} : 0)); + $permsgstatus->set_tag ('BAYESTCLEARNED', $tcount_learned); + $permsgstatus->set_tag ('BAYESTC', $tcount_total); + + $permsgstatus->set_tag ('HAMMYTOKENS', sub { + $self->bayes_report_make_list + ($permsgstatus, $permsgstatus->{bayes_token_info_hammy}, shift); + }); + + $permsgstatus->set_tag ('SPAMMYTOKENS', sub { + $self->bayes_report_make_list + ($permsgstatus, $permsgstatus->{bayes_token_info_spammy}, shift); + }); + + $permsgstatus->set_tag ('TOKENSUMMARY', sub { + if( defined $self->{tag_data}{BAYESTC} ) + { + my $tcount_neutral = $permsgstatus->{tag_data}{BAYESTCLEARNED} + - $permsgstatus->{tag_data}{BAYESTCSPAMMY} + - $permsgstatus->{tag_data}{BAYESTCHAMMY}; + my $tcount_new = $permsgstatus->{tag_data}{BAYESTC} + - $permsgstatus->{tag_data}{BAYESTCLEARNED}; + "Tokens: new, $tcount_new; " + ."hammy, $permsgstatus->{tag_data}{BAYESTCHAMMY}; " + ."neutral, $tcount_neutral; " + ."spammy, $permsgstatus->{tag_data}{BAYESTCSPAMMY}." + } else { + "Bayes not run."; + } + }); + + + return $score; +} + +########################################################################### + +# Plugin hook. +sub learner_dump_database { + my ($self, $params) = @_; + my $magic = $params->{magic}; + my $toks = $params->{toks}; + my $regex = $params->{regex}; + + # allow dump to occur even if use_bayes disables everything else ... + #return 0 unless $self->{conf}->{use_bayes}; + return 0 unless $self->{store}->tie_db_readonly(); + + my @vars = $self->{store}->get_storage_variables(); + + my($sb,$ns,$nh,$nt,$le,$oa,$bv,$js,$ad,$er,$na) = @vars; + + my $template = '%3.3f %10u %10u %10u %s'."\n"; + + if ( $magic ) { + printf ($template, 0.0, 0, $bv, 0, 'non-token data: bayes db version'); + printf ($template, 0.0, 0, $ns, 0, 'non-token data: nspam'); + printf ($template, 0.0, 0, $nh, 0, 'non-token data: nham'); + printf ($template, 0.0, 0, $nt, 0, 'non-token data: ntokens'); + printf ($template, 0.0, 0, $oa, 0, 'non-token data: oldest atime'); + printf ($template, 0.0, 0, $na, 0, 'non-token data: newest atime') if ( $bv >= 2 ); + printf ($template, 0.0, 0, $sb, 0, 'non-token data: current scan-count') if ( $bv < 2 ); + printf ($template, 0.0, 0, $js, 0, 'non-token data: last journal sync atime') if ( $bv >= 2 ); + printf ($template, 0.0, 0, $le, 0, 'non-token data: last expiry atime'); + if ( $bv >= 2 ) { + printf ($template, 0.0, 0, $ad, 0, 'non-token data: last expire atime delta'); + printf ($template, 0.0, 0, $er, 0, 'non-token data: last expire reduction count'); + } + } + + if ( $toks ) { + # let the store sort out the db_toks + $self->{store}->dump_db_toks($template, $regex, @vars); + } + + if (!$self->{main}->{learn_caller_will_untie}) { + $self->{store}->untie_db(); + } + return 1; +} + +########################################################################### +# TODO: these are NOT public, but the test suite needs to call them. + +sub get_msgid { + my ($self, $msg) = @_; + + my @msgid; + + my $msgid = $msg->get_header("Message-Id"); + if (defined $msgid && $msgid ne '' && $msgid !~ /^\s*<\s*(?:\@sa_generated)?>.*$/) { + # remove \r and < and > prefix/suffixes + chomp $msgid; + $msgid =~ s/^.*$//g; + push(@msgid, $msgid); + } + + # Use sha1_hex(Date:, last received: and top N bytes of body) + # where N is MIN(1024 bytes, 1/2 of body length) + # + my $date = $msg->get_header("Date"); + $date = "None" if (!defined $date || $date eq ''); # No Date? + + my @rcvd = $msg->get_header("Received"); + my $rcvd = $rcvd[$#rcvd]; + $rcvd = "None" if (!defined $rcvd || $rcvd eq ''); # No Received? + + # Make a copy since pristine_body is a reference ... + my $body = join('', $msg->get_pristine_body()); + if (length($body) > 64) { # Small Body? + my $keep = ( length $body > 2048 ? 1024 : int(length($body) / 2) ); + substr($body, $keep) = ''; + } + + unshift(@msgid, sha1_hex($date."\000".$rcvd."\000".$body).'@sa_generated'); + + return wantarray ? @msgid : $msgid[0]; +} + +sub get_body_from_msg { + my ($self, $msg) = @_; + + if (!ref $msg) { + # I have no idea why this seems to happen. TODO + warn "bayes: msg not a ref: '$msg'"; + return { }; + } + + my $permsgstatus = + Mail::SpamAssassin::PerMsgStatus->new($self->{main}, $msg); + $msg->extract_message_metadata ($permsgstatus); + my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus); + $permsgstatus->finish(); + + if (!defined $msgdata) { + # why?! + warn "bayes: failed to get body for ".scalar($self->get_msgid($self->{msg}))."\n"; + return { }; + } + + return $msgdata; +} + +sub _get_msgdata_from_permsgstatus { + my ($self, $msg) = @_; + + my $msgdata = { }; + $msgdata->{bayes_token_body} = $msg->{msg}->get_visible_rendered_body_text_array(); + $msgdata->{bayes_token_inviz} = $msg->{msg}->get_invisible_rendered_body_text_array(); + @{$msgdata->{bayes_token_uris}} = $msg->get_uri_list(); + return $msgdata; +} + +########################################################################### + +# The calling functions expect a uniq'ed array of tokens ... +sub tokenize { + my ($self, $msg, $msgdata) = @_; + + # the body + my @tokens = map { $self->_tokenize_line ($_, '', 1) } + @{$msgdata->{bayes_token_body}}; + + # the URI list + push (@tokens, map { $self->_tokenize_line ($_, '', 2) } + @{$msgdata->{bayes_token_uris}}); + + # add invisible tokens + if (ADD_INVIZ_TOKENS_I_PREFIX) { + push (@tokens, map { $self->_tokenize_line ($_, "I*:", 1) } + @{$msgdata->{bayes_token_inviz}}); + } + if (ADD_INVIZ_TOKENS_NO_PREFIX) { + push (@tokens, map { $self->_tokenize_line ($_, "", 1) } + @{$msgdata->{bayes_token_inviz}}); + } + + # Tokenize the headers + my %hdrs = $self->_tokenize_headers ($msg); + while( my($prefix, $value) = each %hdrs ) { + push(@tokens, $self->_tokenize_line ($value, "H$prefix:", 0)); + } + + # Go ahead and uniq the array, skip null tokens (can happen sometimes) + # generate an SHA1 hash and take the lower 40 bits as our token + my %tokens; + foreach my $token (@tokens) { + next unless length($token); # skip 0 length tokens + $tokens{substr(sha1($token), -5)} = $token; + } + + # return the keys == tokens ... + return \%tokens; +} + +sub _tokenize_line { + my $self = $_[0]; + my $tokprefix = $_[2]; + my $region = $_[3]; + local ($_) = $_[1]; + + my @rettokens; + + # include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings, + # and ISO-8859-15 alphas. Do not split on @'s; better results keeping it. + # Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!" + tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs; + + # DO split on "..." or "--" or "---"; common formatting error resulting in + # hapaxes. Keep the separator itself as a token, though, as long ones can + # be good spamsigns. + s/(\w)(\.{3,6})(\w)/$1 $2 $3/gs; + s/(\w)(\-{2,6})(\w)/$1 $2 $3/gs; + + if (IGNORE_TITLE_CASE) { + if ($region == 1 || $region == 2) { + # lower-case Title Case at start of a full-stop-delimited line (as would + # be seen in a Western language). + s/(?:^|\.\s+)([A-Z])([^A-Z]+)(?:\s|$)/ ' '. (lc $1) . $2 . ' ' /ge; + } + } + + my $magic_re = $self->{store}->get_magic_re(); + + foreach my $token (split) { + $token =~ s/^[-'"\.,]+//; # trim non-alphanum chars at start or end + $token =~ s/[-'"\.,]+$//; # so we don't get loads of '"foo' tokens + + # Skip false magic tokens + # TVD: we need to do a defined() check since SQL doesn't have magic + # tokens, so the SQL BayesStore returns undef. I really want a way + # of optimizing that out, but I haven't come up with anything yet. + # + next if ( defined $magic_re && $token =~ /$magic_re/ ); + + # *do* keep 3-byte tokens; there's some solid signs in there + my $len = length($token); + + # but extend the stop-list. These are squarely in the gray + # area, and it just slows us down to record them. + # See http://wiki.apache.org/spamassassin/BayesStopList for more info. + # + next if $len < 3 || + ($token =~ /^(?:a(?:ble|l(?:ready|l)|n[dy]|re)|b(?:ecause|oth)|c(?:an|ome)|e(?:ach|mail|ven)|f(?:ew|irst|or|rom)|give|h(?:a(?:ve|s)|ttp)|i(?:n(?:formation|to)|t\'s)|just|know|l(?:ike|o(?:ng|ok))|m(?:a(?:de|il(?:(?:ing|to))?|ke|ny)|o(?:re|st)|uch)|n(?:eed|o[tw]|umber)|o(?:ff|n(?:ly|e)|ut|wn)|p(?:eople|lace)|right|s(?:ame|ee|uch)|t(?:h(?:at|is|rough|e)|ime)|using|w(?:eb|h(?:ere|y)|ith(?:out)?|or(?:ld|k))|y(?:ears?|ou(?:(?:\'re|r))?))$/i); + + # are we in the body? If so, apply some body-specific breakouts + if ($region == 1 || $region == 2) { + if (CHEW_BODY_MAILADDRS && $token =~ /\S\@\S/i) { + push (@rettokens, $self->_tokenize_mail_addrs ($token)); + } + elsif (CHEW_BODY_URIS && $token =~ /\S\.[a-z]/i) { + push (@rettokens, "UD:".$token); # the full token + my $bit = $token; while ($bit =~ s/^[^\.]+\.(.+)$/$1/gs) { + push (@rettokens, "UD:".$1); # UD = URL domain + } + } + } + + # note: do not trim down overlong tokens if they contain '*'. This is + # used as part of split tokens such as "HTo:D*net" indicating that + # the domain ".net" appeared in the To header. + # + if ($len > MAX_TOKEN_LENGTH && $token !~ /\*/) { + if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) { + # Matt sez: "Could be asian? Autrijus suggested doing character ngrams, + # but I'm doing tuples to keep the dbs small(er)." Sounds like a plan + # to me! (jm) + while ($token =~ s/^(..?)//) { + push (@rettokens, "8:$1"); + } + next; + } + + if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS) + || ($region == 1 && BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS) + || ($region == 2 && URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS)) + { + # if (TOKENIZE_LONG_TOKENS_AS_SKIPS) + # Spambayes trick via Matt: Just retain 7 chars. Do not retain + # the length, it does not help; see my mail to -devel of Nov 20 2002. + # "sk:" stands for "skip". + $token = "sk:".substr($token, 0, 7); + } + } + + # decompose tokens? do this after shortening long tokens + if ($region == 1 || $region == 2) { + if (DECOMPOSE_BODY_TOKENS) { + if ($token =~ /[^\w:\*]/) { + my $decompd = $token; # "Foo!" + $decompd =~ s/[^\w:\*]//gs; + push (@rettokens, $tokprefix.$decompd); # "Foo" + } + + if ($token =~ /[A-Z]/) { + my $decompd = $token; $decompd = lc $decompd; + push (@rettokens, $tokprefix.$decompd); # "foo!" + + if ($token =~ /[^\w:\*]/) { + $decompd =~ s/[^\w:\*]//gs; + push (@rettokens, $tokprefix.$decompd); # "foo" + } + } + } + } + + push (@rettokens, $tokprefix.$token); + } + + return @rettokens; +} + +sub _tokenize_headers { + my ($self, $msg) = @_; + + my %parsed; + + my %user_ignore; + $user_ignore{lc $_} = 1 for @{$self->{main}->{conf}->{bayes_ignore_headers}}; + + # get headers in array context + my @hdrs; + my @rcvdlines; + for ($msg->get_all_headers()) { + # first, keep a copy of Received headers, so we can strip down to last 2 + if (/^Received:/i) { + push(@rcvdlines, $_); + next; + } + # and now skip lines for headers we don't want (including all Received) + next if /^${IGNORED_HDRS}:/i; + next if IGNORE_MSGID_TOKENS && /^Message-ID:/i; + push(@hdrs, $_); + } + push(@hdrs, $msg->get_all_metadata()); + + # and re-add the last 2 received lines: usually a good source of + # spamware tokens and HELO names. + if ($#rcvdlines >= 0) { push(@hdrs, $rcvdlines[$#rcvdlines]); } + if ($#rcvdlines >= 1) { push(@hdrs, $rcvdlines[$#rcvdlines-1]); } + + for (@hdrs) { + next unless /\S/; + my ($hdr, $val) = split(/:/, $_, 2); + + # remove user-specified headers here, after Received, in case they + # want to ignore that too + next if exists $user_ignore{lc $hdr}; + + # Prep the header value + $val ||= ''; + chomp($val); + + # special tokenization for some headers: + if ($hdr =~ /^(?:|X-|Resent-)Message-Id$/i) { + $val = $self->_pre_chew_message_id ($val); + } + elsif (PRE_CHEW_ADDR_HEADERS && $hdr =~ /^(?:|X-|Resent-) + (?:Return-Path|From|To|Cc|Reply-To|Errors-To|Mail-Followup-To|Sender)$/ix) + { + $val = $self->_pre_chew_addr_header ($val); + } + elsif ($hdr eq 'Received') { + $val = $self->_pre_chew_received ($val); + } + elsif ($hdr eq 'Content-Type') { + $val = $self->_pre_chew_content_type ($val); + } + elsif ($hdr eq 'MIME-Version') { + $val =~ s/1\.0//; # totally innocuous + } + elsif ($hdr =~ /^${MARK_PRESENCE_ONLY_HDRS}$/i) { + $val = "1"; # just mark the presence, they create lots of hapaxen + } + + if (MAP_HEADERS_MID) { + if ($hdr =~ /^(?:In-Reply-To|References|Message-ID)$/i) { + $parsed{"*MI"} = $val; + } + } + if (MAP_HEADERS_FROMTOCC) { + if ($hdr =~ /^(?:From|To|Cc)$/i) { + $parsed{"*Ad"} = $val; + } + } + if (MAP_HEADERS_USERAGENT) { + if ($hdr =~ /^(?:X-Mailer|User-Agent)$/i) { + $parsed{"*UA"} = $val; + } + } + + # replace hdr name with "compressed" version if possible + if (defined $HEADER_NAME_COMPRESSION{$hdr}) { + $hdr = $HEADER_NAME_COMPRESSION{$hdr}; + } + + if (exists $parsed{$hdr}) { + $parsed{$hdr} .= " ".$val; + } else { + $parsed{$hdr} = $val; + } + if (would_log('dbg', 'bayes') > 1) { + dbg("bayes: header tokens for $hdr = \"$parsed{$hdr}\""); + } + } + + return %parsed; +} + +sub _pre_chew_content_type { + my ($self, $val) = @_; + + # hopefully this will retain good bits without too many hapaxen + if ($val =~ s/boundary=[\"\'](.*?)[\"\']/ /ig) { + my $boundary = $1; + $boundary =~ s/[a-fA-F0-9]/H/gs; + # break up blocks of separator chars so they become their own tokens + $boundary =~ s/([-_\.=]+)/ $1 /gs; + $val .= $boundary; + } + + # stop-list words for Content-Type header: these wind up totally gray + $val =~ s/\b(?:text|charset)\b//; + + $val; +} + +sub _pre_chew_message_id { + my ($self, $val) = @_; + # we can (a) get rid of a lot of hapaxen and (b) increase the token + # specificity by pre-parsing some common formats. + + # Outlook Express format: + $val =~ s/<([0-9a-f]{4})[0-9a-f]{4}[0-9a-f]{4}\$ + ([0-9a-f]{4})[0-9a-f]{4}\$ + ([0-9a-f]{8})\@(\S+)>/ OEA$1 OEB$2 OEC$3 $4 /gx; + + # Exim: + $val =~ s/<[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]\@//; + + # Sendmail: + $val =~ s/<20\d\d[01]\d[0123]\d[012]\d[012345]\d[012345]\d\. + [A-F0-9]{10,12}\@//gx; + + # try to split Message-ID segments on probable ID boundaries. Note that + # Outlook message-ids seem to contain a server identifier ID in the last + # 8 bytes before the @. Make sure this becomes its own token, it's a + # great spam-sign for a learning system! Be sure to split on ".". + $val =~ s/[^_A-Za-z0-9]/ /g; + $val; +} + +sub _pre_chew_received { + my ($self, $val) = @_; + + # Thanks to Dan for these. Trim out "useless" tokens; sendmail-ish IDs + # and valid-format RFC-822/2822 dates + + $val =~ s/\swith\sSMTP\sid\sg[\dA-Z]{10,12}\s/ /gs; # Sendmail + $val =~ s/\swith\sESMTP\sid\s[\dA-F]{10,12}\s/ /gs; # Sendmail + $val =~ s/\bid\s[a-zA-Z0-9]{7,20}\b/ /gs; # Sendmail + $val =~ s/\bid\s[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]/ /gs; # exim + + $val =~ s/(?:(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s)? + [0-3\s]?[0-9]\s + (?:Jan|Feb|Ma[ry]|Apr|Ju[nl]|Aug|Sep|Oct|Nov|Dec)\s + (?:19|20)?[0-9]{2}\s + [0-2][0-9](?:\:[0-5][0-9]){1,2}\s + (?:\s*\(|\)|\s*(?:[+-][0-9]{4})|\s*(?:UT|[A-Z]{2,3}T))* + //gx; + + # IPs: break down to nearest /24, to reduce hapaxes -- EXCEPT for + # IPs in the 10 and 192.168 ranges, they gets lots of significant tokens + # (on both sides) + # also make a dup with the full IP, as fodder for + # bayes_dump_to_trusted_networks: "H*r:ip*aaa.bbb.ccc.ddd" + $val =~ s{\b(\d{1,3}\.)(\d{1,3}\.)(\d{1,3})(\.\d{1,3})\b}{ + if ($2 eq '10' || ($2 eq '192' && $3 eq '168')) { + $1.$2.$3.$4. + " ip*".$1.$2.$3.$4." "; + } else { + $1.$2.$3. + " ip*".$1.$2.$3.$4." "; + } + }gex; + + # trim these: they turn out as the most common tokens, but with a + # prob of about .5. waste of space! + $val =~ s/\b(?:with|from|for|SMTP|ESMTP)\b/ /g; + + $val; +} + +sub _pre_chew_addr_header { + my ($self, $val) = @_; + local ($_); + + my @addrs = $self->{main}->find_all_addrs_in_line ($val); + my @toks; + foreach (@addrs) { + push (@toks, $self->_tokenize_mail_addrs ($_)); + } + return join (' ', @toks); +} + +sub _tokenize_mail_addrs { + my ($self, $addr) = @_; + + ($addr =~ /(.+)\@(.+)$/) or return (); + my @toks; + push(@toks, "U*".$1, "D*".$2); + $_ = $2; while (s/^[^\.]+\.(.+)$/$1/gs) { push(@toks, "D*".$1); } + return @toks; +} + + +########################################################################### + +# compute the probability that a token is spammish +sub _compute_prob_for_token { + my ($self, $token, $ns, $nn, $s, $n) = @_; + + # we allow the caller to give us the token information, just + # to save a potentially expensive lookup + if (!defined($s) || !defined($n)) { + ($s, $n, undef) = $self->{store}->tok_get ($token); + } + + return if ($s == 0 && $n == 0); + + if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) { + return if ($s + $n < 10); # ignore low-freq tokens + } + + if (!$self->{use_hapaxes}) { + return if ($s + $n < 2); + } + + return if ( $ns == 0 || $nn == 0 ); + + my $ratios = ($s / $ns); + my $ration = ($n / $nn); + + my $prob; + + if ($ratios == 0 && $ration == 0) { + warn "bayes: oops? ratios == ration == 0"; + return; + } else { + $prob = ($ratios) / ($ration + $ratios); + } + + if (USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) { + # use Robinson's f(x) equation for low-n tokens, instead of just + # ignoring them + my $robn = $s+$n; + $prob = ($Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X + ($robn * $prob)) + / + ($Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT + $robn); + } + + # 'log_raw_counts' is used to log the raw data for the Bayes equations during + # a mass-check, allowing the S and X constants to be optimized quickly + # without requiring re-tokenization of the messages for each attempt. There's + # really no need for this code to be uncommented in normal use, however. It + # has never been publicly documented, so commenting it out is fine. ;) + + ## if ($self->{log_raw_counts}) { + ## $self->{raw_counts} .= " s=$s,n=$n "; + ## } + + return $prob; +} + +########################################################################### +# If a token is neither hammy nor spammy, return 0. +# For a spammy token, return the minimum number of additional ham messages +# it would have had to appear in to no longer be spammy. Hammy tokens +# are handled similarly. That's what the function does (at the time +# of this writing, 31 July 2003, 16:02:55 CDT). It would be slightly +# more useful if it returned the number of /additional/ ham messages +# a spammy token would have to appear in to no longer be spammy but I +# fear that might require the solution to a cubic equation, and I +# just don't have the time for that now. + +sub _compute_declassification_distance { + my ($self, $Ns, $Nn, $ns, $nn, $prob) = @_; + + return 0 if $ns == 0 && $nn == 0; + + if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {return 0 if ($ns + $nn < 10);} + if (!$self->{use_hapaxes}) {return 0 if ($ns + $nn < 2);} + + return 0 if $Ns == 0 || $Nn == 0; + return 0 if abs( $prob - 0.5 ) < + $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH; + + my ($Na,$na,$Nb,$nb) = $prob > 0.5 ? ($Nn,$nn,$Ns,$ns) : ($Ns,$ns,$Nn,$nn); + my $p = 0.5 - $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH; + + return int( 1.0 - 1e-6 + $nb * $Na * $p / ($Nb * ( 1 - $p )) ) - $na + unless USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS; + + my $s = $Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT; + my $sx = $Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X; + my $a = $Nb * ( 1 - $p ); + my $b = $Nb * ( $sx + $nb * ( 1 - $p ) - $p * $s ) - $p * $Na * $nb; + my $c = $Na * $nb * ( $sx - $p * ( $s + $nb ) ); + my $discrim = $b * $b - 4 * $a * $c; + my $disc_max_0 = $discrim < 0 ? 0 : $discrim; + my $dd_exact = ( 1.0 - 1e-6 + ( -$b + sqrt( $disc_max_0 ) ) / ( 2*$a ) ) - $na; + + # This shouldn't be necessary. Should not be < 1 + return $dd_exact < 1 ? 1 : int($dd_exact); +} + +########################################################################### + +sub _opportunistic_calls { + my($self, $journal_only) = @_; + + # If we're not already tied, abort. + if (!$self->{store}->db_readable()) { + dbg("bayes: opportunistic call attempt failed, DB not readable"); + return; + } + + # Is an expire or sync running? + my $running_expire = $self->{store}->get_running_expire_tok(); + if ( defined $running_expire && $running_expire+$OPPORTUNISTIC_LOCK_VALID > time() ) { + dbg("bayes: opportunistic call attempt skipped, found fresh running expire magic token"); + return; + } + + # handle expiry and syncing + if (!$journal_only && $self->{store}->expiry_due()) { + dbg("bayes: opportunistic call found expiry due"); + + # sync will bring the DB R/W as necessary, and the expire will remove + # the running_expire token, may untie as well. + $self->{main}->{bayes_scanner}->sync(1,1); + } + elsif ( $self->{store}->sync_due() ) { + dbg("bayes: opportunistic call found journal sync due"); + + # sync will bring the DB R/W as necessary, may untie as well + $self->{main}->{bayes_scanner}->sync(1,0); + + # We can only remove the running_expire token if we're doing R/W + if ($self->{store}->db_writable()) { + $self->{store}->remove_running_expire_tok(); + } + } + + return; +} + +########################################################################### + +sub learner_new { + my ($self) = @_; + + if ($self->{conf}->{bayes_store_module}) { + my $module = $self->{conf}->{bayes_store_module}; + $module = untaint_var($module); # good enough? + my $store; + + eval ' + require '.$module.'; + $store = '.$module.'->new($self); + 1; + ' or do { + my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; + die "bayes: (in new) $eval_stat\n"; + }; + $self->{store} = $store; + } + else { + require Mail::SpamAssassin::BayesStore::DBM; + $self->{store} = Mail::SpamAssassin::BayesStore::DBM->new($self); + } + + $self; +} + +########################################################################### + +sub bayes_report_make_list { + my ($self, $pms, $info, $param) = @_; + return "Tokens not available." unless defined $info; + + my ($limit,$fmt_arg,$more) = split /,/, ($param || '5'); + + my %formats = ( + short => '$t', + Short => 'Token: \"$t\"', + compact => '$p-$D--$t', + Compact => 'Probability $p -declassification distance $D (\"+\" means > 9) --token: \"$t\"', + medium => '$p-$D-$N--$t', + long => '$p-$d--${h}h-${s}s--${a}d--$t', + Long => 'Probability $p -declassification distance $D --in ${h} ham messages -and ${s} spam messages --${a} days old--token:\"$t\"' + ); + + my $raw_fmt = (!$fmt_arg ? '$p-$D--$t' : $formats{$fmt_arg}); + + return "Invalid format, must be one of: ".join(",",keys %formats) + unless defined $raw_fmt; + + my $fmt = '"'.$raw_fmt.'"'; + my $amt = $limit < @$info ? $limit : @$info; + return "" unless $amt; + + my $ns = $pms->{bayes_nspam}; + my $nh = $pms->{bayes_nham}; + my $digit = sub { $_[0] > 9 ? "+" : $_[0] }; + my $now = time; + + join ', ', map { + my($t,$prob,$s,$h,$u) = @$_; + my $a = int(($now - $u)/(3600 * 24)); + my $d = $self->_compute_declassification_distance($ns,$nh,$s,$h,$prob); + my $p = sprintf "%.3f", $prob; + my $n = $s + $h; + my ($c,$o) = $prob < 0.5 ? ($h,$s) : ($s,$h); + my ($D,$S,$H,$C,$O,$N) = map &$digit($_), ($d,$s,$h,$c,$o,$n); + eval $fmt; ## no critic + } @{$info}[0..$amt-1]; +} + 1; + +=back + +=cut Index: lib/Mail/SpamAssassin/Conf.pm =================================================================== --- lib/Mail/SpamAssassin/Conf.pm (.../trunk) (revision 602913) +++ lib/Mail/SpamAssassin/Conf.pm (.../branches/bug-5293-pluginized-bayes) (revision 602913) @@ -1278,6 +1278,20 @@ =over 4 +=item use_learner ( 0 | 1 ) (default: 1) + +Whether to use any machine-learning classifiers with SpamAssassin, such as the +default 'BAYES_*' rules. Setting this to 0 will disable use of any and all +human-trained classifiers. + +=cut + + push (@cmds, { + setting => 'use_learner', + default => 1, + type => $CONF_TYPE_BOOL + }); + =item use_bayes ( 0 | 1 ) (default: 1) Whether to use the naive-Bayesian-style classifier built into @@ -2629,6 +2643,77 @@ } }); +##OSBF##=item osbf_path /path/filename (default: ~/.spamassassin/osbf) +##OSBF## +##OSBF##This is the directory and filename for Bayes databases. Several databases +##OSBF##will be created, with this as the base directory and filename, with C<_toks>, +##OSBF##C<_seen>, etc. appended to the base. The default setting results in files +##OSBF##called C<~/.spamassassin/osbf_seen>, C<~/.spamassassin/osbf_toks>, etc. +##OSBF## +##OSBF##By default, each user has their own in their C<~/.spamassassin> directory with +##OSBF##mode 0700/0600. For system-wide SpamAssassin use, you may want to reduce disk +##OSBF##space usage by sharing this across all users. However, Bayes appears to be +##OSBF##more effective with individual user databases. +##OSBF## +##OSBF##=cut +##OSBF## +##OSBF## push (@cmds, { +##OSBF## setting => 'osbf_path', +##OSBF## is_admin => 1, +##OSBF## default => '__userstate__/osbf', +##OSBF## code => sub { +##OSBF## my ($self, $key, $value, $line) = @_; +##OSBF## unless (defined $value && $value !~ /^$/) { +##OSBF## return $MISSING_REQUIRED_VALUE; +##OSBF## } +##OSBF## if (-d $value) { +##OSBF## return $INVALID_VALUE; +##OSBF## } +##OSBF## $self->{osbf_path} = $value; +##OSBF## } +##OSBF## }); +##OSBF## +##OSBF##=item osbf_file_mode (default: 0700) +##OSBF## +##OSBF##The file mode bits used for the Bayesian filtering database files. +##OSBF## +##OSBF##Make sure you specify this using the 'x' mode bits set, as it may also be used +##OSBF##to create directories. However, if a file is created, the resulting file will +##OSBF##not have any execute bits set (the umask is set to 111). The argument is a +##OSBF##string of octal digits, it is converted to a numeric value internally. +##OSBF## +##OSBF##=cut +##OSBF## +##OSBF## push (@cmds, { +##OSBF## setting => 'osbf_file_mode', +##OSBF## is_admin => 1, +##OSBF## default => '0700', +##OSBF## type => $CONF_TYPE_NUMERIC +##OSBF## }); +##OSBF## +##OSBF##=item osbf_store_module Name::Of::BayesStore::Module +##OSBF## +##OSBF##If this option is set, the module given will be used as an alternate +##OSBF##to the default osbf storage mechanism. It must conform to the +##OSBF##published storage specification (see +##OSBF##Mail::SpamAssassin::BayesStore). For example, set this to +##OSBF##Mail::SpamAssassin::BayesStore::SQL to use the generic SQL storage +##OSBF##module. +##OSBF## +##OSBF##=cut +##OSBF## +##OSBF## push (@cmds, { +##OSBF## setting => 'osbf_store_module', +##OSBF## is_admin => 1, +##OSBF## default => '', +##OSBF## code => sub { +##OSBF## my ($self, $key, $value, $line) = @_; +##OSBF## local ($1); +##OSBF## if ($value !~ /^([_A-Za-z0-9:]+)$/) { return $INVALID_VALUE; } +##OSBF## $self->{osbf_store_module} = $1; +##OSBF## } +##OSBF## }); + =item bayes_sql_dsn DBI::databasetype:databasename:hostname:port Used for BayesStore::SQL storage implementation. Index: lib/Mail/SpamAssassin/Bayes.pm =================================================================== --- lib/Mail/SpamAssassin/Bayes.pm (.../trunk) (revision 602913) +++ lib/Mail/SpamAssassin/Bayes.pm (.../branches/bug-5293-pluginized-bayes) (revision 602913) @@ -17,31 +17,17 @@ =head1 NAME -Mail::SpamAssassin::Bayes - determine spammishness using a Bayesian classifier +Mail::SpamAssassin::Bayes - support for learning classifiers =head1 DESCRIPTION -This is a Bayesian-like form of probability-analysis classification, using an -algorithm based on the one detailed in Paul Graham's I paper -at: +This is the general class used to train a learning classifier with new samples +of spam and ham mail, and classify based on prior training. - http://www.paulgraham.com/spam.html +Prior to version 3.3.0, the default Bayes implementation was here; if you're +looking for information on that, it has moved to +C. -It also incorporates some other aspects taken from Graham Robinson's webpage -on the subject at: - - http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html - -And the chi-square probability combiner as described here: - - http://www.linuxjournal.com/print.php?sid=6467 - -The results are incorporated into SpamAssassin as the BAYES_* rules. - -=head1 METHODS - -=over 4 - =cut package Mail::SpamAssassin::Bayes; @@ -56,172 +42,8 @@ use Mail::SpamAssassin::Logger; use Mail::SpamAssassin::Util qw(untaint_var); -# pick ONLY ONE of these combining implementations. -use Mail::SpamAssassin::Bayes::CombineChi; -# use Mail::SpamAssassin::Bayes::CombineNaiveBayes; +our @ISA = qw(); -use Digest::SHA1 qw(sha1 sha1_hex); - -use vars qw{ - @ISA - $IGNORED_HDRS - $MARK_PRESENCE_ONLY_HDRS - %HEADER_NAME_COMPRESSION - $OPPORTUNISTIC_LOCK_VALID -}; - -@ISA = qw(); - -# Which headers should we scan for tokens? Don't use all of them, as it's easy -# to pick up spurious clues from some. What we now do is use all of them -# *less* these well-known headers; that way we can pick up spammers' tracking -# headers (which are obviously not well-known in advance!). - -# Received is handled specially -$IGNORED_HDRS = qr{(?: (?:X-)?Sender # misc noise - |Delivered-To |Delivery-Date - |(?:X-)?Envelope-To - |X-MIME-Auto[Cc]onverted |X-Converted-To-Plain-Text - - |Subject # not worth a tiny gain vs. to db size increase - - # Date: can provide invalid cues if your spam corpus is - # older/newer than ham - |Date - - # List headers: ignore. a spamfiltering mailing list will - # become a nonspam sign. - |X-List|(?:X-)?Mailing-List - |(?:X-)?List-(?:Archive|Help|Id|Owner|Post|Subscribe - |Unsubscribe|Host|Id|Manager|Admin|Comment - |Name|Url) - |X-Unsub(?:scribe)? - |X-Mailman-Version |X-Been[Tt]here |X-Loop - |Mail-Followup-To - |X-eGroups-(?:Return|From) - |X-MDMailing-List - |X-XEmacs-List - - # gatewayed through mailing list (thanks to Allen Smith) - |(?:X-)?Resent-(?:From|To|Date) - |(?:X-)?Original-(?:From|To|Date) - - # Spamfilter/virus-scanner headers: too easy to chain from - # these - |X-MailScanner(?:-SpamCheck)? - |X-Spam(?:-(?:Status|Level|Flag|Report|Hits|Score|Checker-Version))? - |X-Antispam |X-RBL-Warning |X-Mailscanner - |X-MDaemon-Deliver-To |X-Virus-Scanned - |X-Mass-Check-Id - |X-Pyzor |X-DCC-\S{2,25}-Metrics - |X-Filtered-B[Yy] |X-Scanned-By |X-Scanner - |X-AP-Spam-(?:Score|Status) |X-RIPE-Spam-Status - |X-SpamCop-[^:]+ - |X-SMTPD |(?:X-)?Spam-Apparently-To - |SPAM |X-Perlmx-Spam - |X-Bogosity - - # some noisy Outlook headers that add no good clues: - |Content-Class |Thread-(?:Index|Topic) - |X-Original[Aa]rrival[Tt]ime - - # Annotations from IMAP, POP, and MH: - |(?:X-)?Status |X-Flags |Replied |Forwarded - |Lines |Content-Length - |X-UIDL? |X-IMAPbase - - # Annotations from Bugzilla - |X-Bugzilla-[^:]+ - - # Annotations from VM: (thanks to Allen Smith) - |X-VM-(?:Bookmark|(?:POP|IMAP)-Retrieved|Labels|Last-Modified - |Summary-Format|VHeader|v\d-Data|Message-Order) - - # Annotations from Gnus: - | X-Gnus-Mail-Source - | Xref - -)}x; - -# Note only the presence of these headers, in order to reduce the -# hapaxen they generate. -$MARK_PRESENCE_ONLY_HDRS = qr{(?: X-Face - |X-(?:Gnu-?PG|PGP|GPG)(?:-Key)?-Fingerprint -)}ix; - -# tweaks tested as of Nov 18 2002 by jm: see SpamAssassin-devel list archives -# for results. The winners are now the default settings. -use constant IGNORE_TITLE_CASE => 1; -use constant TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES => 1; -use constant TOKENIZE_LONG_TOKENS_AS_SKIPS => 1; - -# tweaks of May 12 2003, see SpamAssassin-devel archives again. -use constant PRE_CHEW_ADDR_HEADERS => 1; -use constant CHEW_BODY_URIS => 1; -use constant CHEW_BODY_MAILADDRS => 1; -use constant HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1; -use constant BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1; -use constant URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 0; -use constant IGNORE_MSGID_TOKENS => 0; - -# tweaks of 12 March 2004, see bug 2129. -use constant DECOMPOSE_BODY_TOKENS => 1; -use constant MAP_HEADERS_MID => 1; -use constant MAP_HEADERS_FROMTOCC => 1; -use constant MAP_HEADERS_USERAGENT => 1; - -# tweaks, see http://issues.apache.org/SpamAssassin/show_bug.cgi?id=3173#c26 -use constant ADD_INVIZ_TOKENS_I_PREFIX => 1; -use constant ADD_INVIZ_TOKENS_NO_PREFIX => 0; - -# We store header-mined tokens in the db with a "HHeaderName:val" format. -# some headers may contain lots of gibberish tokens, so allow a little basic -# compression by mapping the header name at least here. these are the headers -# which appear with the most frequency in my db. note: this doesn't have to -# be 2-way (ie. LHSes that map to the same RHS are not a problem), but mixing -# tokens from multiple different headers may impact accuracy, so might as well -# avoid this if possible. These are the top ones from my corpus, BTW (jm). -%HEADER_NAME_COMPRESSION = ( - 'Message-Id' => '*m', - 'Message-ID' => '*M', - 'Received' => '*r', - 'User-Agent' => '*u', - 'References' => '*f', - 'In-Reply-To' => '*i', - 'From' => '*F', - 'Reply-To' => '*R', - 'Return-Path' => '*p', - 'Return-path' => '*rp', - 'X-Mailer' => '*x', - 'X-Authentication-Warning' => '*a', - 'Organization' => '*o', - 'Organisation' => '*o', - 'Content-Type' => '*c', - 'X-Spam-Relays-Trusted' => '*RT', - 'X-Spam-Relays-Untrusted' => '*RU', -); - -# How many seconds should the opportunistic_expire lock be valid? -$OPPORTUNISTIC_LOCK_VALID = 300; - -# Should we use the Robinson f(w) equation from -# http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html ? -# It gives better results, in that scores are more likely to distribute -# into the <0.5 range for nonspam and >0.5 for spam. -use constant USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS => 1; - -# How many of the most significant tokens should we use for the p(w) -# calculation? -use constant N_SIGNIFICANT_TOKENS => 150; - -# How many significant tokens are required for a classifier score to -# be considered usable? -use constant REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE => -1; - -# How long a token should we hold onto? (note: German speakers typically -# will require a longer token than English ones.) -use constant MAX_TOKEN_LENGTH => 15; - ########################################################################### sub new { @@ -232,450 +54,49 @@ my $self = { 'main' => $main, 'conf' => $main->{conf}, - ## 'log_raw_counts' => 0, # see compute_prob_for_token() 'use_ignores' => 1, }; bless ($self, $class); - if ($self->{conf}->{bayes_store_module}) { - my $module = $self->{conf}->{bayes_store_module}; - $module = untaint_var($module); # good enough? - my $store; - - eval ' - require '.$module.'; - $store = '.$module.'->new($self); - 1; - ' or do { - my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; - die "bayes: (in new) $eval_stat\n"; - }; - $self->{store} = $store; - } - else { - require Mail::SpamAssassin::BayesStore::DBM; - $self->{store} = Mail::SpamAssassin::BayesStore::DBM->new($self); - } - + $self->{main}->call_plugins("learner_new"); $self; } +########################################################################### + sub finish { my $self = shift; - #if (!$self->{conf}->{use_bayes}) { return; } - - # if we're untying too much, uncomment this... - # use Carp qw(cluck); cluck "stack trace at untie"; - - $self->{store}->untie_db(); + # we don't need to do the plugin; Mail::SpamAssassin::finish() does + # that for us %{$self} = (); } -sub sa_die { Mail::SpamAssassin::sa_die(@_); } - ########################################################################### -sub sanity_check_is_untied { +# force the Bayes dbs to be closed, if they haven't already been; called +# at the end of scan operation, or when switching between user IDs, +# or when C is called. +# +sub force_close { my $self = shift; my $quiet = shift; - - # do a sanity check here. Wierd things happen if we remain tied - # after compiling; for example, spamd will never see that the - # number of messages has reached the bayes-scanning threshold. - if ($self->{store}->db_readable()) { - warn "bayes: oops! still tied to bayes DBs, untying\n" unless $quiet; - $self->{store}->untie_db(); - } + $self->{main}->call_plugins("learner_close", { quiet => $quiet }); } ########################################################################### -# read configuration items to control bayes behaviour. Called by -# BayesStore::read_db_configs(). -sub read_db_configs { - my ($self) = @_; - - # use of hapaxes. Set on bayes object, since it controls prob - # computation. - $self->{use_hapaxes} = $self->{conf}->{bayes_use_hapaxes}; -} - -########################################################################### - -# The calling functions expect a uniq'ed array of tokens ... -sub tokenize { - my ($self, $msg, $msgdata) = @_; - - # the body - my @tokens = map { $self->tokenize_line ($_, '', 1) } - @{$msgdata->{bayes_token_body}}; - - # the URI list - push (@tokens, map { $self->tokenize_line ($_, '', 2) } - @{$msgdata->{bayes_token_uris}}); - - # add invisible tokens - if (ADD_INVIZ_TOKENS_I_PREFIX) { - push (@tokens, map { $self->tokenize_line ($_, "I*:", 1) } - @{$msgdata->{bayes_token_inviz}}); - } - if (ADD_INVIZ_TOKENS_NO_PREFIX) { - push (@tokens, map { $self->tokenize_line ($_, "", 1) } - @{$msgdata->{bayes_token_inviz}}); - } - - # Tokenize the headers - my %hdrs = $self->tokenize_headers ($msg); - while( my($prefix, $value) = each %hdrs ) { - push(@tokens, $self->tokenize_line ($value, "H$prefix:", 0)); - } - - # Go ahead and uniq the array, skip null tokens (can happen sometimes) - # generate an SHA1 hash and take the lower 40 bits as our token - my %tokens; - foreach my $token (@tokens) { - next unless length($token); # skip 0 length tokens - $tokens{substr(sha1($token), -5)} = $token; - } - - # return the keys == tokens ... - return \%tokens; -} - -sub tokenize_line { - my $self = $_[0]; - my $tokprefix = $_[2]; - my $region = $_[3]; - local ($_) = $_[1]; - - my @rettokens; - - # include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings, - # and ISO-8859-15 alphas. Do not split on @'s; better results keeping it. - # Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!" - tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs; - - # DO split on "..." or "--" or "---"; common formatting error resulting in - # hapaxes. Keep the separator itself as a token, though, as long ones can - # be good spamsigns. - s/(\w)(\.{3,6})(\w)/$1 $2 $3/gs; - s/(\w)(\-{2,6})(\w)/$1 $2 $3/gs; - - if (IGNORE_TITLE_CASE) { - if ($region == 1 || $region == 2) { - # lower-case Title Case at start of a full-stop-delimited line (as would - # be seen in a Western language). - s/(?:^|\.\s+)([A-Z])([^A-Z]+)(?:\s|$)/ ' '. (lc $1) . $2 . ' ' /ge; - } - } - - my $magic_re = $self->{store}->get_magic_re(); - - foreach my $token (split) { - $token =~ s/^[-'"\.,]+//; # trim non-alphanum chars at start or end - $token =~ s/[-'"\.,]+$//; # so we don't get loads of '"foo' tokens - - # Skip false magic tokens - # TVD: we need to do a defined() check since SQL doesn't have magic - # tokens, so the SQL BayesStore returns undef. I really want a way - # of optimizing that out, but I haven't come up with anything yet. - # - next if ( defined $magic_re && $token =~ /$magic_re/ ); - - # *do* keep 3-byte tokens; there's some solid signs in there - my $len = length($token); - - # but extend the stop-list. These are squarely in the gray - # area, and it just slows us down to record them. - # See http://wiki.apache.org/spamassassin/BayesStopList for more info. - # - next if $len < 3 || - ($token =~ /^(?:a(?:ble|l(?:ready|l)|n[dy]|re)|b(?:ecause|oth)|c(?:an|ome)|e(?:ach|mail|ven)|f(?:ew|irst|or|rom)|give|h(?:a(?:ve|s)|ttp)|i(?:n(?:formation|to)|t\'s)|just|know|l(?:ike|o(?:ng|ok))|m(?:a(?:de|il(?:(?:ing|to))?|ke|ny)|o(?:re|st)|uch)|n(?:eed|o[tw]|umber)|o(?:ff|n(?:ly|e)|ut|wn)|p(?:eople|lace)|right|s(?:ame|ee|uch)|t(?:h(?:at|is|rough|e)|ime)|using|w(?:eb|h(?:ere|y)|ith(?:out)?|or(?:ld|k))|y(?:ears?|ou(?:(?:\'re|r))?))$/i); - - # are we in the body? If so, apply some body-specific breakouts - if ($region == 1 || $region == 2) { - if (CHEW_BODY_MAILADDRS && $token =~ /\S\@\S/i) { - push (@rettokens, $self->tokenize_mail_addrs ($token)); - } - elsif (CHEW_BODY_URIS && $token =~ /\S\.[a-z]/i) { - push (@rettokens, "UD:".$token); # the full token - my $bit = $token; while ($bit =~ s/^[^\.]+\.(.+)$/$1/gs) { - push (@rettokens, "UD:".$1); # UD = URL domain - } - } - } - - # note: do not trim down overlong tokens if they contain '*'. This is - # used as part of split tokens such as "HTo:D*net" indicating that - # the domain ".net" appeared in the To header. - # - if ($len > MAX_TOKEN_LENGTH && $token !~ /\*/) { - if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) { - # Matt sez: "Could be asian? Autrijus suggested doing character ngrams, - # but I'm doing tuples to keep the dbs small(er)." Sounds like a plan - # to me! (jm) - while ($token =~ s/^(..?)//) { - push (@rettokens, "8:$1"); - } - next; - } - - if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS) - || ($region == 1 && BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS) - || ($region == 2 && URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS)) - { - # if (TOKENIZE_LONG_TOKENS_AS_SKIPS) - # Spambayes trick via Matt: Just retain 7 chars. Do not retain - # the length, it does not help; see my mail to -devel of Nov 20 2002. - # "sk:" stands for "skip". - $token = "sk:".substr($token, 0, 7); - } - } - - # decompose tokens? do this after shortening long tokens - if ($region == 1 || $region == 2) { - if (DECOMPOSE_BODY_TOKENS) { - if ($token =~ /[^\w:\*]/) { - my $decompd = $token; # "Foo!" - $decompd =~ s/[^\w:\*]//gs; - push (@rettokens, $tokprefix.$decompd); # "Foo" - } - - if ($token =~ /[A-Z]/) { - my $decompd = $token; $decompd = lc $decompd; - push (@rettokens, $tokprefix.$decompd); # "foo!" - - if ($token =~ /[^\w:\*]/) { - $decompd =~ s/[^\w:\*]//gs; - push (@rettokens, $tokprefix.$decompd); # "foo" - } - } - } - } - - push (@rettokens, $tokprefix.$token); - } - - return @rettokens; -} - -sub tokenize_headers { - my ($self, $msg) = @_; - - my %parsed; - - my %user_ignore; - $user_ignore{lc $_} = 1 for @{$self->{main}->{conf}->{bayes_ignore_headers}}; - - # get headers in array context - my @hdrs; - my @rcvdlines; - for ($msg->get_all_headers()) { - # first, keep a copy of Received headers, so we can strip down to last 2 - if (/^Received:/i) { - push(@rcvdlines, $_); - next; - } - # and now skip lines for headers we don't want (including all Received) - next if /^${IGNORED_HDRS}:/i; - next if IGNORE_MSGID_TOKENS && /^Message-ID:/i; - push(@hdrs, $_); - } - push(@hdrs, $msg->get_all_metadata()); - - # and re-add the last 2 received lines: usually a good source of - # spamware tokens and HELO names. - if ($#rcvdlines >= 0) { push(@hdrs, $rcvdlines[$#rcvdlines]); } - if ($#rcvdlines >= 1) { push(@hdrs, $rcvdlines[$#rcvdlines-1]); } - - for (@hdrs) { - next unless /\S/; - my ($hdr, $val) = split(/:/, $_, 2); - - # remove user-specified headers here, after Received, in case they - # want to ignore that too - next if exists $user_ignore{lc $hdr}; - - # Prep the header value - $val ||= ''; - chomp($val); - - # special tokenization for some headers: - if ($hdr =~ /^(?:|X-|Resent-)Message-Id$/i) { - $val = $self->pre_chew_message_id ($val); - } - elsif (PRE_CHEW_ADDR_HEADERS && $hdr =~ /^(?:|X-|Resent-) - (?:Return-Path|From|To|Cc|Reply-To|Errors-To|Mail-Followup-To|Sender)$/ix) - { - $val = $self->pre_chew_addr_header ($val); - } - elsif ($hdr eq 'Received') { - $val = $self->pre_chew_received ($val); - } - elsif ($hdr eq 'Content-Type') { - $val = $self->pre_chew_content_type ($val); - } - elsif ($hdr eq 'MIME-Version') { - $val =~ s/1\.0//; # totally innocuous - } - elsif ($hdr =~ /^${MARK_PRESENCE_ONLY_HDRS}$/i) { - $val = "1"; # just mark the presence, they create lots of hapaxen - } - - if (MAP_HEADERS_MID) { - if ($hdr =~ /^(?:In-Reply-To|References|Message-ID)$/i) { - $parsed{"*MI"} = $val; - } - } - if (MAP_HEADERS_FROMTOCC) { - if ($hdr =~ /^(?:From|To|Cc)$/i) { - $parsed{"*Ad"} = $val; - } - } - if (MAP_HEADERS_USERAGENT) { - if ($hdr =~ /^(?:X-Mailer|User-Agent)$/i) { - $parsed{"*UA"} = $val; - } - } - - # replace hdr name with "compressed" version if possible - if (defined $HEADER_NAME_COMPRESSION{$hdr}) { - $hdr = $HEADER_NAME_COMPRESSION{$hdr}; - } - - if (exists $parsed{$hdr}) { - $parsed{$hdr} .= " ".$val; - } else { - $parsed{$hdr} = $val; - } - if (would_log('dbg', 'bayes') > 1) { - dbg("bayes: header tokens for $hdr = \"$parsed{$hdr}\""); - } - } - - return %parsed; -} - -sub pre_chew_content_type { - my ($self, $val) = @_; - - # hopefully this will retain good bits without too many hapaxen - if ($val =~ s/boundary=[\"\'](.*?)[\"\']/ /ig) { - my $boundary = $1; - $boundary =~ s/[a-fA-F0-9]/H/gs; - # break up blocks of separator chars so they become their own tokens - $boundary =~ s/([-_\.=]+)/ $1 /gs; - $val .= $boundary; - } - - # stop-list words for Content-Type header: these wind up totally gray - $val =~ s/\b(?:text|charset)\b//; - - $val; -} - -sub pre_chew_message_id { - my ($self, $val) = @_; - # we can (a) get rid of a lot of hapaxen and (b) increase the token - # specificity by pre-parsing some common formats. - - # Outlook Express format: - $val =~ s/<([0-9a-f]{4})[0-9a-f]{4}[0-9a-f]{4}\$ - ([0-9a-f]{4})[0-9a-f]{4}\$ - ([0-9a-f]{8})\@(\S+)>/ OEA$1 OEB$2 OEC$3 $4 /gx; - - # Exim: - $val =~ s/<[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]\@//; - - # Sendmail: - $val =~ s/<20\d\d[01]\d[0123]\d[012]\d[012345]\d[012345]\d\. - [A-F0-9]{10,12}\@//gx; - - # try to split Message-ID segments on probable ID boundaries. Note that - # Outlook message-ids seem to contain a server identifier ID in the last - # 8 bytes before the @. Make sure this becomes its own token, it's a - # great spam-sign for a learning system! Be sure to split on ".". - $val =~ s/[^_A-Za-z0-9]/ /g; - $val; -} - -sub pre_chew_received { - my ($self, $val) = @_; - - # Thanks to Dan for these. Trim out "useless" tokens; sendmail-ish IDs - # and valid-format RFC-822/2822 dates - - $val =~ s/\swith\sSMTP\sid\sg[\dA-Z]{10,12}\s/ /gs; # Sendmail - $val =~ s/\swith\sESMTP\sid\s[\dA-F]{10,12}\s/ /gs; # Sendmail - $val =~ s/\bid\s[a-zA-Z0-9]{7,20}\b/ /gs; # Sendmail - $val =~ s/\bid\s[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]/ /gs; # exim - - $val =~ s/(?:(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s)? - [0-3\s]?[0-9]\s - (?:Jan|Feb|Ma[ry]|Apr|Ju[nl]|Aug|Sep|Oct|Nov|Dec)\s - (?:19|20)?[0-9]{2}\s - [0-2][0-9](?:\:[0-5][0-9]){1,2}\s - (?:\s*\(|\)|\s*(?:[+-][0-9]{4})|\s*(?:UT|[A-Z]{2,3}T))* - //gx; - - # IPs: break down to nearest /24, to reduce hapaxes -- EXCEPT for - # IPs in the 10 and 192.168 ranges, they gets lots of significant tokens - # (on both sides) - # also make a dup with the full IP, as fodder for - # bayes_dump_to_trusted_networks: "H*r:ip*aaa.bbb.ccc.ddd" - $val =~ s{\b(\d{1,3}\.)(\d{1,3}\.)(\d{1,3})(\.\d{1,3})\b}{ - if ($2 eq '10' || ($2 eq '192' && $3 eq '168')) { - $1.$2.$3.$4. - " ip*".$1.$2.$3.$4." "; - } else { - $1.$2.$3. - " ip*".$1.$2.$3.$4." "; - } - }gex; - - # trim these: they turn out as the most common tokens, but with a - # prob of about .5. waste of space! - $val =~ s/\b(?:with|from|for|SMTP|ESMTP)\b/ /g; - - $val; -} - -sub pre_chew_addr_header { - my ($self, $val) = @_; - local ($_); - - my @addrs = $self->{main}->find_all_addrs_in_line ($val); - my @toks; - foreach (@addrs) { - push (@toks, $self->tokenize_mail_addrs ($_)); - } - return join (' ', @toks); -} - -sub tokenize_mail_addrs { - my ($self, $addr) = @_; - - ($addr =~ /(.+)\@(.+)$/) or return (); - my @toks; - push(@toks, "U*".$1, "D*".$2); - $_ = $2; while (s/^[^\.]+\.(.+)$/$1/gs) { push(@toks, "D*".$1); } - return @toks; -} - -########################################################################### - sub ignore_message { my ($self,$PMS) = @_; return 0 unless $self->{use_ignores}; - my $ig_from = $self->{main}->call_plugins ("check_wb_list", { permsgstatus => $PMS, type => 'from', list => 'bayes_ignore_from' }); - my $ig_to = $self->{main}->call_plugins ("check_wb_list", { permsgstatus => $PMS, type => 'to', list => 'bayes_ignore_to' }); + my $ig_from = $self->{main}->call_plugins ("check_wb_list", + { permsgstatus => $PMS, type => 'from', list => 'bayes_ignore_from' }); + my $ig_to = $self->{main}->call_plugins ("check_wb_list", + { permsgstatus => $PMS, type => 'to', list => 'bayes_ignore_to' }); my $ignore = $ig_from || $ig_to; - dbg("bayes: not using bayes, bayes_ignore_from or _to rule") if $ignore; - return $ignore; } @@ -683,10 +104,9 @@ sub learn { my ($self, $isspam, $msg, $id) = @_; + return unless $self->{conf}->{use_learner}; + return unless defined $msg; - if (!$self->{conf}->{use_bayes}) { return; } - if (!defined $msg) { return; } - if( $self->{use_ignores} ) # Remove test when PerMsgStatus available. { # DMK, koppel@ece.lsu.edu: Hoping that the ultimate fix to bug 2263 will @@ -697,321 +117,29 @@ return 0 if $ignore; } - my $msgdata = $self->get_body_from_msg ($msg); - my $ret; - - eval { - local $SIG{'__DIE__'}; # do not run user die() traps in here - - my $ok; - if ($self->{main}->{learn_to_journal}) { - # If we're going to learn to journal, we'll try going r/o first... - # If that fails for some reason, let's try going r/w. This happens - # if the DB doesn't exist yet. - $ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable(); - } else { - $ok = $self->{store}->tie_db_writable(); - } - - if ($ok) { - $ret = $self->learn_trapped ($isspam, $msg, $msgdata, $id); - - if (!$self->{main}->{learn_caller_will_untie}) { - $self->{store}->untie_db(); - } - } - 1; - } or do { # if we died, untie the dbs. - my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; - $self->{store}->untie_db(); - die "bayes: (in learn) $eval_stat\n"; - }; - - return $ret; + return $self->{main}->call_plugins("learn_message", { isspam => $isspam, msg => $msg, id => $id }); } -# this function is trapped by the wrapper above -sub learn_trapped { - my ($self, $isspam, $msg, $msgdata, $msgid) = @_; - my @msgid = ( $msgid ); - - if (!defined $msgid) { - @msgid = $self->get_msgid($msg); - } - - foreach $msgid ( @msgid ) { - my $seen = $self->{store}->seen_get ($msgid); - - if (defined ($seen)) { - if (($seen eq 's' && $isspam) || ($seen eq 'h' && !$isspam)) { - dbg("bayes: $msgid already learnt correctly, not learning twice"); - return 0; - } elsif ($seen !~ /^[hs]$/) { - warn("bayes: db_seen corrupt: value='$seen' for $msgid, ignored"); - } else { - # bug 3704: If the message was already learned, don't try learning it again. - # this prevents, for instance, manually learning as spam, then autolearning - # as ham, or visa versa. - if ($self->{main}->{learn_no_relearn}) { - dbg("bayes: $msgid already learnt as opposite, not re-learning"); - return 0; - } - - dbg("bayes: $msgid already learnt as opposite, forgetting first"); - - # kluge so that forget() won't untie the db on us ... - my $orig = $self->{main}->{learn_caller_will_untie}; - $self->{main}->{learn_caller_will_untie} = 1; - - my $fatal = !defined $self->forget ($msg); - - # reset the value post-forget() ... - $self->{main}->{learn_caller_will_untie} = $orig; - - # forget() gave us a fatal error, so propagate that up - if ($fatal) { - dbg("bayes: forget() returned a fatal error, so learn() will too"); - return; - } - } - - # we're only going to have seen this once, so stop if it's been - # seen already - last; - } - } - - # Now that we're sure we haven't seen this message before ... - $msgid = $msgid[0]; - - if ($isspam) { - $self->{store}->nspam_nham_change (1, 0); - } else { - $self->{store}->nspam_nham_change (0, 1); - } - - my $msgatime = $msg->receive_date(); - - # If the message atime comes back as being more than 1 day in the - # future, something's messed up and we should revert to current time as - # a safety measure. - # - $msgatime = time if ( $msgatime - time > 86400 ); - - my $tokens = $self->tokenize($msg, $msgdata); - - if ($isspam) { - $self->{store}->multi_tok_count_change(1, 0, $tokens, $msgatime); - } else { - $self->{store}->multi_tok_count_change(0, 1, $tokens, $msgatime); - } - - $self->{store}->seen_put ($msgid, ($isspam ? 's' : 'h')); - $self->{store}->cleanup(); - - $self->{main}->call_plugins("bayes_learn", { toksref => $tokens, - isspam => $isspam, - msgid => $msgid, - msgatime => $msgatime, - }); - - dbg("bayes: learned '$msgid', atime: $msgatime"); - - 1; -} - ########################################################################### sub forget { my ($self, $msg, $id) = @_; - - if (!$self->{conf}->{use_bayes}) { return; } - if (!defined $msg) { return; } - - my $msgdata = $self->get_body_from_msg ($msg); - my $ret; - - # we still tie for writing here, since we write to the seen db - # synchronously - eval { - local $SIG{'__DIE__'}; # do not run user die() traps in here - - my $ok; - if ($self->{main}->{learn_to_journal}) { - # If we're going to learn to journal, we'll try going r/o first... - # If that fails for some reason, let's try going r/w. This happens - # if the DB doesn't exist yet. - $ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable(); - } else { - $ok = $self->{store}->tie_db_writable(); - } - - if ($ok) { - $ret = $self->forget_trapped ($msg, $msgdata, $id); - - if (!$self->{main}->{learn_caller_will_untie}) { - $self->{store}->untie_db(); - } - } - 1; - } or do { # if we died, untie the dbs. - my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; - $self->{store}->untie_db(); - die "bayes: (in forget) $eval_stat\n"; - }; - - return $ret; + return unless $self->{conf}->{use_learner}; + return unless defined $msg; + return $self->{main}->call_plugins("forget_message", { msg => $msg, id => $id }); } -# this function is trapped by the wrapper above -sub forget_trapped { - my ($self, $msg, $msgdata, $msgid) = @_; - my @msgid = ( $msgid ); - my $isspam; - - if (!defined $msgid) { - @msgid = $self->get_msgid($msg); - } - - while( $msgid = shift @msgid ) { - my $seen = $self->{store}->seen_get ($msgid); - - if (defined ($seen)) { - if ($seen eq 's') { - $isspam = 1; - } elsif ($seen eq 'h') { - $isspam = 0; - } else { - dbg("bayes: forget: msgid $msgid seen entry is neither ham nor spam, ignored"); - return 0; - } - - # messages should only be learned once, so stop if we find a msgid - # which was seen before - last; - } - else { - dbg("bayes: forget: msgid $msgid not learnt, ignored"); - } - } - - # This message wasn't learnt before, so return - if (!defined $isspam) { - dbg("bayes: forget: no msgid from this message has been learnt, skipping message"); - return 0; - } - elsif ($isspam) { - $self->{store}->nspam_nham_change (-1, 0); - } - else { - $self->{store}->nspam_nham_change (0, -1); - } - - my $tokens = $self->tokenize($msg, $msgdata); - - if ($isspam) { - $self->{store}->multi_tok_count_change (-1, 0, $tokens); - } else { - $self->{store}->multi_tok_count_change (0, -1, $tokens); - } - - $self->{store}->seen_delete ($msgid); - $self->{store}->cleanup(); - - $self->{main}->call_plugins("bayes_forget", { toksref => $tokens, - isspam => $isspam, - msgid => $msgid, - }); - - 1; -} - ########################################################################### -sub get_msgid { - my ($self, $msg) = @_; - - my @msgid; - - my $msgid = $msg->get_header("Message-Id"); - if (defined $msgid && $msgid ne '' && $msgid !~ /^\s*<\s*(?:\@sa_generated)?>.*$/) { - # remove \r and < and > prefix/suffixes - chomp $msgid; - $msgid =~ s/^.*$//g; - push(@msgid, $msgid); - } - - # Use sha1_hex(Date:, last received: and top N bytes of body) - # where N is MIN(1024 bytes, 1/2 of body length) - # - my $date = $msg->get_header("Date"); - $date = "None" if (!defined $date || $date eq ''); # No Date? - - my @rcvd = $msg->get_header("Received"); - my $rcvd = $rcvd[$#rcvd]; - $rcvd = "None" if (!defined $rcvd || $rcvd eq ''); # No Received? - - # Make a copy since pristine_body is a reference ... - my $body = join('', $msg->get_pristine_body()); - if (length($body) > 64) { # Small Body? - my $keep = ( length $body > 2048 ? 1024 : int(length($body) / 2) ); - substr($body, $keep) = ''; - } - - unshift(@msgid, sha1_hex($date."\000".$rcvd."\000".$body).'@sa_generated'); - - return wantarray ? @msgid : $msgid[0]; -} - -sub get_body_from_msg { - my ($self, $msg) = @_; - - if (!ref $msg) { - # I have no idea why this seems to happen. TODO - warn "bayes: msg not a ref: '$msg'"; - return { }; - } - - my $permsgstatus = - Mail::SpamAssassin::PerMsgStatus->new($self->{main}, $msg); - $msg->extract_message_metadata ($permsgstatus); - my $msgdata = $self->get_msgdata_from_permsgstatus ($permsgstatus); - $permsgstatus->finish(); - - if (!defined $msgdata) { - # why?! - warn "bayes: failed to get body for ".scalar($self->get_msgid($self->{msg}))."\n"; - return { }; - } - - return $msgdata; -} - -sub get_msgdata_from_permsgstatus { - my ($self, $msg) = @_; - - my $msgdata = { }; - $msgdata->{bayes_token_body} = $msg->{msg}->get_visible_rendered_body_text_array(); - $msgdata->{bayes_token_inviz} = $msg->{msg}->get_invisible_rendered_body_text_array(); - @{$msgdata->{bayes_token_uris}} = $msg->get_uri_list(); - return $msgdata; -} - -########################################################################### - sub sync { my ($self, $sync, $expire, $opts) = @_; - if (!$self->{conf}->{use_bayes}) { return 0; } + return 0 unless $self->{conf}->{use_learner}; if ($sync) { - dbg("bayes: bayes journal sync starting"); - $self->{store}->sync($opts); - dbg("bayes: bayes journal sync completed"); + $self->{main}->call_plugins("learner_sync", $opts ); } if ($expire) { - dbg("bayes: expiry starting"); - $self->{store}->expire_old_tokens($opts); - dbg("bayes: expiry completed"); + $self->{main}->call_plugins("learner_expire_old_training", $opts ); } return 0; @@ -1019,408 +147,21 @@ ########################################################################### -# compute the probability that a token is spammish -sub compute_prob_for_token { - my ($self, $token, $ns, $nn, $s, $n) = @_; - - # we allow the caller to give us the token information, just - # to save a potentially expensive lookup - if (!defined($s) || !defined($n)) { - ($s, $n, undef) = $self->{store}->tok_get ($token); - } - - return if ($s == 0 && $n == 0); - - if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) { - return if ($s + $n < 10); # ignore low-freq tokens - } - - if (!$self->{use_hapaxes}) { - return if ($s + $n < 2); - } - - return if ( $ns == 0 || $nn == 0 ); - - my $ratios = ($s / $ns); - my $ration = ($n / $nn); - - my $prob; - - if ($ratios == 0 && $ration == 0) { - warn "bayes: oops? ratios == ration == 0"; - return; - } else { - $prob = ($ratios) / ($ration + $ratios); - } - - if (USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) { - # use Robinson's f(x) equation for low-n tokens, instead of just - # ignoring them - my $robn = $s+$n; - $prob = ($Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X + ($robn * $prob)) - / - ($Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT + $robn); - } - - # 'log_raw_counts' is used to log the raw data for the Bayes equations during - # a mass-check, allowing the S and X constants to be optimized quickly - # without requiring re-tokenization of the messages for each attempt. There's - # really no need for this code to be uncommented in normal use, however. It - # has never been publicly documented, so commenting it out is fine. ;) - - ## if ($self->{log_raw_counts}) { - ## $self->{raw_counts} .= " s=$s,n=$n "; - ## } - - return $prob; -} - -########################################################################### -# If a token is neither hammy nor spammy, return 0. -# For a spammy token, return the minimum number of additional ham messages -# it would have had to appear in to no longer be spammy. Hammy tokens -# are handled similarly. That's what the function does (at the time -# of this writing, 31 July 2003, 16:02:55 CDT). It would be slightly -# more useful if it returned the number of /additional/ ham messages -# a spammy token would have to appear in to no longer be spammy but I -# fear that might require the solution to a cubic equation, and I -# just don't have the time for that now. - -sub compute_declassification_distance { - my ($self, $Ns, $Nn, $ns, $nn, $prob) = @_; - - return 0 if $ns == 0 && $nn == 0; - - if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {return 0 if ($ns + $nn < 10);} - if (!$self->{use_hapaxes}) {return 0 if ($ns + $nn < 2);} - - return 0 if $Ns == 0 || $Nn == 0; - return 0 if abs( $prob - 0.5 ) < - $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH; - - my ($Na,$na,$Nb,$nb) = $prob > 0.5 ? ($Nn,$nn,$Ns,$ns) : ($Ns,$ns,$Nn,$nn); - my $p = 0.5 - $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH; - - return int( 1.0 - 1e-6 + $nb * $Na * $p / ($Nb * ( 1 - $p )) ) - $na - unless USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS; - - my $s = $Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT; - my $sx = $Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X; - my $a = $Nb * ( 1 - $p ); - my $b = $Nb * ( $sx + $nb * ( 1 - $p ) - $p * $s ) - $p * $Na * $nb; - my $c = $Na * $nb * ( $sx - $p * ( $s + $nb ) ); - my $discrim = $b * $b - 4 * $a * $c; - my $disc_max_0 = $discrim < 0 ? 0 : $discrim; - my $dd_exact = ( 1.0 - 1e-6 + ( -$b + sqrt( $disc_max_0 ) ) / ( 2*$a ) ) - $na; - - # This shouldn't be necessary. Should not be < 1 - return $dd_exact < 1 ? 1 : int($dd_exact); -} - - -# Check to make sure we can tie() the DB, and we have enough entries to do a scan -# if we're told the caller will untie(), go ahead and leave the db tied. sub is_scan_available { my $self = shift; - - return 0 unless $self->{conf}->{use_bayes}; - return 0 unless $self->{store}->tie_db_readonly(); - - # We need the DB to stay tied, so if the journal sync occurs, don't untie! - my $caller_untie = $self->{main}->{learn_caller_will_untie}; - $self->{main}->{learn_caller_will_untie} = 1; - - # Do a journal sync if necessary. Do this before the nspam_nham_get() - # call since the sync may cause an update in the number of messages - # learnt. - $self->opportunistic_calls(1); - - # Reset the variable appropriately - $self->{main}->{learn_caller_will_untie} = $caller_untie; - - my ($ns, $nn) = $self->{store}->nspam_nham_get(); - - if ($ns < $self->{conf}->{bayes_min_spam_num}) { - dbg("bayes: not available for scanning, only $ns spam(s) in bayes DB < ".$self->{conf}->{bayes_min_spam_num}); - if (!$self->{main}->{learn_caller_will_untie}) { - $self->{store}->untie_db(); - } - return 0; - } - if ($nn < $self->{conf}->{bayes_min_ham_num}) { - dbg("bayes: not available for scanning, only $nn ham(s) in bayes DB < ".$self->{conf}->{bayes_min_ham_num}); - if (!$self->{main}->{learn_caller_will_untie}) { - $self->{store}->untie_db(); - } - return 0; - } - - return 1; + return 0 unless $self->{conf}->{use_learner}; + return $self->{main}->call_plugins("learner_is_scan_available"); } ########################################################################### -# Finally, the scoring function for testing mail. -sub scan { - my ($self, $permsgstatus, $msg) = @_; - my $score; - - # When we're doing a scan, we'll guarantee that we'll do the untie, - # so override the global setting until we're done. - my $caller_untie = $self->{main}->{learn_caller_will_untie}; - $self->{main}->{learn_caller_will_untie} = 1; - - goto skip if ($self->ignore_message($permsgstatus)); - - goto skip unless $self->is_scan_available(); - - my ($ns, $nn) = $self->{store}->nspam_nham_get(); - - ## if ($self->{log_raw_counts}) { # see compute_prob_for_token() - ## $self->{raw_counts} = " ns=$ns nn=$nn "; - ## } - - dbg("bayes: corpus size: nspam = $ns, nham = $nn"); - - my $msgdata = $self->get_msgdata_from_permsgstatus ($permsgstatus); - - my $msgtokens = $self->tokenize($msg, $msgdata); - - my $tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens}); - - my %pw; - - foreach my $tokendata (@{$tokensdata}) { - my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata}; - my $prob = $self->compute_prob_for_token($token, $ns, $nn, $tok_spam, $tok_ham); - next unless defined $prob; - - $pw{$token} = { - prob => $prob, - spam_count => $tok_spam, - ham_count => $tok_ham, - atime => $atime - }; - } - - # If none of the tokens were found in the DB, we're going to skip - # this message... - if (!keys %pw) { - dbg("bayes: cannot use bayes on this message; none of the tokens were found in the database"); - goto skip; - } - - my $tcount_total = keys %{$msgtokens}; - my $tcount_learned = keys %pw; - - # Figure out the message receive time (used as atime below) - # If the message atime comes back as being in the future, something's - # messed up and we should revert to current time as a safety measure. - # - my $msgatime = $msg->receive_date(); - my $now = time; - $msgatime = $now if ( $msgatime > $now ); - - # now take the $count most significant tokens and calculate probs using - # Robinson's formula. - my $count = N_SIGNIFICANT_TOKENS; - my @sorted; - - my @touch_tokens; - my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = []; - my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = []; - - my %tok_strength = map { $_ => (abs($pw{$_}->{prob} - 0.5)) } keys %pw; - my $log_each_token = (would_log('dbg', 'bayes') > 1); - - foreach my $tok (sort { - $tok_strength{$b} <=> $tok_strength{$a} - } keys %pw) - { - if ($count-- < 0) { last; } - next if ($tok_strength{$tok} < - $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH); - - my $pw = $pw{$tok}->{prob}; - - # What's more expensive, scanning headers for HAMMYTOKENS and - # SPAMMYTOKENS tags that aren't there or collecting data that - # won't be used? Just collecting the data is certainly simpler. - # - my $raw_token = $msgtokens->{$tok} || "(unknown)"; - my $s = $pw{$tok}->{spam_count}; - my $n = $pw{$tok}->{ham_count}; - my $a = $pw{$tok}->{atime}; - - if ($pw < 0.5) { - push @$tinfo_hammy, [$raw_token,$pw,$s,$n,$a]; - } else { - push @$tinfo_spammy, [$raw_token,$pw,$s,$n,$a]; - } - - push (@sorted, $pw); - - # update the atime on this token, it proved useful - push(@touch_tokens, $tok); - - if ($log_each_token) { - dbg("bayes: token '$raw_token' => $pw"); - } - } - - if (!@sorted || (REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE > 0 && - $#sorted <= REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE)) - { - dbg("bayes: cannot use bayes on this message; not enough usable tokens found"); - goto skip; - } - - $score = Mail::SpamAssassin::Bayes::Combine::combine($ns, $nn, \@sorted); - - # Couldn't come up with a probability? - goto skip unless defined $score; - - dbg("bayes: score = $score"); - - # no need to call tok_touch_all unless there were significant - # tokens and a score was returned - # we don't really care about the return value here - $self->{store}->tok_touch_all(\@touch_tokens, $msgatime); - - $permsgstatus->{bayes_nspam} = $ns; - $permsgstatus->{bayes_nham} = $nn; - - ## if ($self->{log_raw_counts}) { # see compute_prob_for_token() - ## print "#Bayes-Raw-Counts: $self->{raw_counts}\n"; - ## } - - $self->{main}->call_plugins("bayes_scan", { toksref => $msgtokens, - probsref => \%pw, - score => $score, - msgatime => $msgatime, - significant_tokens => \@touch_tokens, - }); - -skip: - if (!defined $score) { - dbg("bayes: not scoring message, returning undef"); - } - - # Take any opportunistic actions we can take - if ($self->{main}->{opportunistic_expire_check_only}) { - # we're supposed to report on expiry only -- so do the - # opportunistic_calls() run for the journal only. - $self->opportunistic_calls(1); - $permsgstatus->{bayes_expiry_due} = $self->{store}->expiry_due(); - } - else { - $self->opportunistic_calls(); - } - - # Do any cleanup we need to do - $self->{store}->cleanup(); - - # Reset the value accordingly - $self->{main}->{learn_caller_will_untie} = $caller_untie; - - # If our caller won't untie the db, we need to do it. - if (!$caller_untie) { - $self->{store}->untie_db(); - } - - $permsgstatus->{tag_data}{BAYESTCHAMMY} = - ($tinfo_hammy ? scalar @{$tinfo_hammy} : 0); - $permsgstatus->{tag_data}{BAYESTCSPAMMY} = - ($tinfo_spammy ? scalar @{$tinfo_spammy} : 0); - $permsgstatus->{tag_data}{BAYESTCLEARNED} = $tcount_learned; - $permsgstatus->{tag_data}{BAYESTC} = $tcount_total; - - return $score; -} - -sub opportunistic_calls { - my($self, $journal_only) = @_; - - # If we're not already tied, abort. - if (!$self->{store}->db_readable()) { - dbg("bayes: opportunistic call attempt failed, DB not readable"); - return; - } - - # Is an expire or sync running? - my $running_expire = $self->{store}->get_running_expire_tok(); - if ( defined $running_expire && $running_expire+$OPPORTUNISTIC_LOCK_VALID > time() ) { - dbg("bayes: opportunistic call attempt skipped, found fresh running expire magic token"); - return; - } - - # handle expiry and syncing - if (!$journal_only && $self->{store}->expiry_due()) { - dbg("bayes: opportunistic call found expiry due"); - - # sync will bring the DB R/W as necessary, and the expire will remove - # the running_expire token, may untie as well. - $self->sync(1,1); - } - elsif ( $self->{store}->sync_due() ) { - dbg("bayes: opportunistic call found journal sync due"); - - # sync will bring the DB R/W as necessary, may untie as well - $self->sync(1,0); - - # We can only remove the running_expire token if we're doing R/W - if ($self->{store}->db_writable()) { - $self->{store}->remove_running_expire_tok(); - } - } - - return; -} - -########################################################################### - sub dump_bayes_db { my($self, $magic, $toks, $regex) = @_; - - # allow dump to occur even if use_bayes disables everything else ... - #return 0 unless $self->{conf}->{use_bayes}; - return 0 unless $self->{store}->tie_db_readonly(); - - my @vars = $self->{store}->get_storage_variables(); - - my($sb,$ns,$nh,$nt,$le,$oa,$bv,$js,$ad,$er,$na) = @vars; - - my $template = '%3.3f %10u %10u %10u %s'."\n"; - - if ( $magic ) { - printf ($template, 0.0, 0, $bv, 0, 'non-token data: bayes db version'); - printf ($template, 0.0, 0, $ns, 0, 'non-token data: nspam'); - printf ($template, 0.0, 0, $nh, 0, 'non-token data: nham'); - printf ($template, 0.0, 0, $nt, 0, 'non-token data: ntokens'); - printf ($template, 0.0, 0, $oa, 0, 'non-token data: oldest atime'); - printf ($template, 0.0, 0, $na, 0, 'non-token data: newest atime') if ( $bv >= 2 ); - printf ($template, 0.0, 0, $sb, 0, 'non-token data: current scan-count') if ( $bv < 2 ); - printf ($template, 0.0, 0, $js, 0, 'non-token data: last journal sync atime') if ( $bv >= 2 ); - printf ($template, 0.0, 0, $le, 0, 'non-token data: last expiry atime'); - if ( $bv >= 2 ) { - printf ($template, 0.0, 0, $ad, 0, 'non-token data: last expire atime delta'); - printf ($template, 0.0, 0, $er, 0, 'non-token data: last expire reduction count'); - } - } - - if ( $toks ) { - # let the store sort out the db_toks - $self->{store}->dump_db_toks($template, $regex, @vars); - } - - if (!$self->{main}->{learn_caller_will_untie}) { - $self->{store}->untie_db(); - } - return 1; + return 0 unless $self->{conf}->{use_learner}; + return $self->{main}->call_plugins("learner_dump_database", { + magic => $magic, toks => $toks, regex => $regex }); } 1; -=back - =cut Index: rules/23_bayes.cf =================================================================== --- rules/23_bayes.cf (.../trunk) (revision 602913) +++ rules/23_bayes.cf (.../branches/bug-5293-pluginized-bayes) (revision 602913) @@ -53,14 +53,14 @@ tflags BAYES_95 learn tflags BAYES_99 learn -describe BAYES_00 Bayesian spam probability is 0 to 1% -describe BAYES_05 Bayesian spam probability is 1 to 5% -describe BAYES_20 Bayesian spam probability is 5 to 20% -describe BAYES_40 Bayesian spam probability is 20 to 40% -describe BAYES_50 Bayesian spam probability is 40 to 60% -describe BAYES_60 Bayesian spam probability is 60 to 80% -describe BAYES_80 Bayesian spam probability is 80 to 95% -describe BAYES_95 Bayesian spam probability is 95 to 99% -describe BAYES_99 Bayesian spam probability is 99 to 100% +describe BAYES_00 Bayes spam probability is 0 to 1% +describe BAYES_05 Bayes spam probability is 1 to 5% +describe BAYES_20 Bayes spam probability is 5 to 20% +describe BAYES_40 Bayes spam probability is 20 to 40% +describe BAYES_50 Bayes spam probability is 40 to 60% +describe BAYES_60 Bayes spam probability is 60 to 80% +describe BAYES_80 Bayes spam probability is 80 to 95% +describe BAYES_95 Bayes spam probability is 95 to 99% +describe BAYES_99 Bayes spam probability is 99 to 100% endif Index: t/bayessdbm.t =================================================================== --- t/bayessdbm.t (.../trunk) (revision 602913) +++ t/bayessdbm.t (.../branches/bug-5293-pluginized-bayes) (revision 602913) @@ -34,8 +34,12 @@ ok($sa); -ok($sa->{bayes_scanner}); +sub getimpl { + return $sa->call_plugins("learner_get_implementation"); +} +ok($sa->{bayes_scanner} && getimpl); + ok(!$sa->{bayes_scanner}->is_scan_available()); open(MAIL,"< data/spam/001"); @@ -52,49 +56,49 @@ ok($mail); -my $body = $sa->{bayes_scanner}->get_body_from_msg($mail); +my $body = getimpl->get_body_from_msg($mail); ok($body); -my $toks = $sa->{bayes_scanner}->tokenize($mail, $body); +my $toks = getimpl->tokenize($mail, $body); ok(scalar(keys %{$toks}) > 0); -my($msgid,$msgid_hdr) = $sa->{bayes_scanner}->get_msgid($mail); +my($msgid,$msgid_hdr) = getimpl->get_msgid($mail); # $msgid is the generated hash messageid # $msgid_hdr is the Message-Id header ok($msgid eq 'ce33e4a8bc5798c65428d6018380bae346c7c126@sa_generated'); ok($msgid_hdr eq '9PS291LhupY'); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); -ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid)); +ok(!getimpl->{store}->seen_get($msgid)); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); ok($sa->{bayes_scanner}->learn(1, $mail)); ok(!$sa->{bayes_scanner}->learn(1, $mail)); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); -ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 's'); +ok(getimpl->{store}->seen_get($msgid) eq 's'); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); my $tokerror = 0; foreach my $tok (keys %{$toks}) { - my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok); + my ($spam, $ham, $atime) = getimpl->{store}->tok_get($tok); if ($spam == 0 || $ham > 0) { $tokerror = 1; } } ok(!$tokerror); -my $tokens = $sa->{bayes_scanner}->{store}->tok_get_all(keys %{$toks}); +my $tokens = getimpl->{store}->tok_get_all(keys %{$toks}); ok($tokens); @@ -107,36 +111,36 @@ } ok(!$tokerror); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); ok($sa->{bayes_scanner}->learn(0, $mail)); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); -ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 'h'); +ok(getimpl->{store}->seen_get($msgid) eq 'h'); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); $tokerror = 0; foreach my $tok (keys %{$toks}) { - my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok); + my ($spam, $ham, $atime) = getimpl->{store}->tok_get($tok); if ($spam > 0 || $ham == 0) { $tokerror = 1; } } ok(!$tokerror); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); ok($sa->{bayes_scanner}->forget($mail)); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); -ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid)); +ok(!getimpl->{store}->seen_get($msgid)); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); undef $sa; @@ -217,13 +221,13 @@ $mail = $sa->parse( $raw_message ); -$body = $sa->{bayes_scanner}->get_body_from_msg($mail); +$body = getimpl->get_body_from_msg($mail); my $msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail); ok($msgstatus); -my $score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body); +my $score = getimpl->scan($msgstatus, $mail, $body); # Pretty much we can't count on the data returned with such little training # so just make sure that the score wasn't equal to .5 which is the default @@ -242,11 +246,11 @@ $mail = $sa->parse( $raw_message ); -$body = $sa->{bayes_scanner}->get_body_from_msg($mail); +$body = getimpl->get_body_from_msg($mail); $msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail); -$score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body); +$score = getimpl->scan($msgstatus, $mail, $body); # Pretty much we can't count on the data returned with such little training # so just make sure that the score wasn't equal to .5 which is the default @@ -256,7 +260,7 @@ } -ok($sa->{bayes_scanner}->{store}->clear_database()); +ok(getimpl->{store}->clear_database()); ok(!-e 'log/user_state/bayes_journal'); ok(!-e 'log/user_state/bayes_seen.pag'); Index: t/bayessql.t =================================================================== --- t/bayessql.t (.../trunk) (revision 602913) +++ t/bayessql.t (.../branches/bug-5293-pluginized-bayes) (revision 602913) @@ -60,17 +60,21 @@ ok($sa); -ok($sa->{bayes_scanner}); +sub getimpl { + return $sa->call_plugins("learner_get_implementation"); +} -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok($sa->{bayes_scanner} && getimpl); +ok(getimpl->{store}->tie_db_writable()); + # This bit breaks abstraction a bit, the userid is an implementation detail, # but is necessary to perform some of the tests. Perhaps in the future we # can add some sort of official API for this sort of thing. -my $testuserid = $sa->{bayes_scanner}->{store}->{_userid}; +my $testuserid = getimpl->{store}->{_userid}; ok(defined($testuserid)); -ok($sa->{bayes_scanner}->{store}->clear_database()); +ok(getimpl->{store}->clear_database()); ok(database_clear_p($testuser, $testuserid)); @@ -95,7 +99,7 @@ ok($sa->{bayes_scanner}); -ok(!$sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(!getimpl->{store}->tie_db_writable()); $sa->finish_learner(); @@ -138,49 +142,49 @@ ok($mail); -my $body = $sa->{bayes_scanner}->get_body_from_msg($mail); +my $body = getimpl->get_body_from_msg($mail); ok($body); -my $toks = $sa->{bayes_scanner}->tokenize($mail, $body); +my $toks = getimpl->tokenize($mail, $body); ok(scalar(keys %{$toks}) > 0); -my($msgid,$msgid_hdr) = $sa->{bayes_scanner}->get_msgid($mail); +my($msgid,$msgid_hdr) = getimpl->get_msgid($mail); # $msgid is the generated hash messageid # $msgid_hdr is the Message-Id header ok($msgid eq 'ce33e4a8bc5798c65428d6018380bae346c7c126@sa_generated'); ok($msgid_hdr eq '9PS291LhupY'); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); -ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid)); +ok(!getimpl->{store}->seen_get($msgid)); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); ok($sa->{bayes_scanner}->learn(1, $mail)); ok(!$sa->{bayes_scanner}->learn(1, $mail)); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); -ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 's'); +ok(getimpl->{store}->seen_get($msgid) eq 's'); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); my $tokerror = 0; foreach my $tok (keys %{$toks}) { - my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok); + my ($spam, $ham, $atime) = getimpl->{store}->tok_get($tok); if ($spam == 0 || $ham > 0) { $tokerror = 1; } } ok(!$tokerror); -my $tokens = $sa->{bayes_scanner}->{store}->tok_get_all(keys %{$toks}); +my $tokens = getimpl->{store}->tok_get_all(keys %{$toks}); ok($tokens); @@ -194,44 +198,44 @@ ok(!$tokerror); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); ok($sa->{bayes_scanner}->learn(0, $mail)); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); -ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 'h'); +ok(getimpl->{store}->seen_get($msgid) eq 'h'); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); $tokerror = 0; foreach my $tok (keys %{$toks}) { - my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok); + my ($spam, $ham, $atime) = getimpl->{store}->tok_get($tok); if ($spam > 0 || $ham == 0) { $tokerror = 1; } } ok(!$tokerror); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); ok($sa->{bayes_scanner}->forget($mail)); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); -ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid)); +ok(!getimpl->{store}->seen_get($msgid)); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); # This bit breaks abstraction a bit, the userid is an implementation detail, # but is necessary to perform some of the tests. Perhaps in the future we # can add some sort of official API for this sort of thing. -$testuserid = $sa->{bayes_scanner}->{store}->{_userid}; +$testuserid = getimpl->{store}->{_userid}; ok(defined($testuserid)); -ok($sa->{bayes_scanner}->{store}->clear_database()); +ok(getimpl->{store}->clear_database()); ok(database_clear_p($testuser, $testuserid)); @@ -293,13 +297,13 @@ $mail = $sa->parse( \@msg ); -$body = $sa->{bayes_scanner}->get_body_from_msg($mail); +$body = getimpl->get_body_from_msg($mail); my $msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail); ok($msgstatus); -my $score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body); +my $score = getimpl->scan($msgstatus, $mail, $body); # Pretty much we can't count on the data returned with such little training # so just make sure that the score wasn't equal to .5 which is the default @@ -324,11 +328,11 @@ $mail = $sa->parse( \@msg ); -$body = $sa->{bayes_scanner}->get_body_from_msg($mail); +$body = getimpl->get_body_from_msg($mail); $msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail); -$score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body); +$score = getimpl->scan($msgstatus, $mail, $body); # Pretty much we can't count on the data returned with such little training # so just make sure that the score wasn't equal to .5 which is the default @@ -340,10 +344,10 @@ # This bit breaks abstraction a bit, the userid is an implementation detail, # but is necessary to perform some of the tests. Perhaps in the future we # can add some sort of official API for this sort of thing. -$testuserid = $sa->{bayes_scanner}->{store}->{_userid}; +$testuserid = getimpl->{store}->{_userid}; ok(defined($testuserid)); -ok($sa->{bayes_scanner}->{store}->clear_database()); +ok(getimpl->{store}->clear_database()); ok(database_clear_p($testuser, $testuserid)); Index: t/bayesdbm.t =================================================================== --- t/bayesdbm.t (.../trunk) (revision 602913) +++ t/bayesdbm.t (.../branches/bug-5293-pluginized-bayes) (revision 602913) @@ -31,9 +31,12 @@ $sa->init(); +sub getimpl { + return $sa->call_plugins("learner_get_implementation"); +} ok($sa); -ok($sa->{bayes_scanner}); +ok ($sa->{bayes_scanner} && getimpl); ok(!$sa->{bayes_scanner}->is_scan_available()); @@ -51,15 +54,15 @@ ok($mail); -my $body = $sa->{bayes_scanner}->get_body_from_msg($mail); +my $body = getimpl->get_body_from_msg($mail); ok($body); -my $toks = $sa->{bayes_scanner}->tokenize($mail, $body); +my $toks = getimpl->tokenize($mail, $body); ok(scalar(keys %{$toks}) > 0); -my($msgid,$msgid_hdr) = $sa->{bayes_scanner}->get_msgid($mail); +my($msgid,$msgid_hdr) = getimpl->get_msgid($mail); # $msgid is the generated hash messageid # $msgid_hdr is the Message-Id header @@ -67,34 +70,34 @@ or warn "got: [$msgid]"; ok($msgid_hdr eq '9PS291LhupY'); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); -ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid)); +ok(!getimpl->{store}->seen_get($msgid)); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); ok($sa->{bayes_scanner}->learn(1, $mail)); ok(!$sa->{bayes_scanner}->learn(1, $mail)); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); -ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 's'); +ok(getimpl->{store}->seen_get($msgid) eq 's'); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); my $tokerror = 0; foreach my $tok (keys %{$toks}) { - my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok); + my ($spam, $ham, $atime) = getimpl->{store}->tok_get($tok); if ($spam == 0 || $ham > 0) { $tokerror = 1; } } ok(!$tokerror); -my $tokens = $sa->{bayes_scanner}->{store}->tok_get_all(keys %{$toks}); +my $tokens = getimpl->{store}->tok_get_all(keys %{$toks}); ok($tokens); @@ -107,36 +110,36 @@ } ok(!$tokerror); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); ok($sa->{bayes_scanner}->learn(0, $mail)); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); -ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 'h'); +ok(getimpl->{store}->seen_get($msgid) eq 'h'); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); $tokerror = 0; foreach my $tok (keys %{$toks}) { - my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok); + my ($spam, $ham, $atime) = getimpl->{store}->tok_get($tok); if ($spam > 0 || $ham == 0) { $tokerror = 1; } } ok(!$tokerror); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); ok($sa->{bayes_scanner}->forget($mail)); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); -ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid)); +ok(!getimpl->{store}->seen_get($msgid)); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); undef $sa; @@ -213,13 +216,13 @@ $mail = $sa->parse( $raw_message ); -$body = $sa->{bayes_scanner}->get_body_from_msg($mail); +$body = getimpl->get_body_from_msg($mail); my $msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail); ok($msgstatus); -my $score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body); +my $score = getimpl->scan($msgstatus, $mail, $body); # Pretty much we can't count on the data returned with such little training # so just make sure that the score wasn't equal to .5 which is the default @@ -238,11 +241,11 @@ $mail = $sa->parse( $raw_message ); -$body = $sa->{bayes_scanner}->get_body_from_msg($mail); +$body = getimpl->get_body_from_msg($mail); $msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail); -$score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body); +$score = getimpl->scan($msgstatus, $mail, $body); # Pretty much we can't count on the data returned with such little training # so just make sure that the score wasn't equal to .5 which is the default @@ -252,7 +255,7 @@ } -ok($sa->{bayes_scanner}->{store}->clear_database()); +ok(getimpl->{store}->clear_database()); ok(!-e 'log/user_state/bayes_journal'); ok(!-e 'log/user_state/bayes_seen'); Index: t/bayesdbm_flock.t =================================================================== --- t/bayesdbm_flock.t (.../trunk) (revision 602913) +++ t/bayesdbm_flock.t (.../branches/bug-5293-pluginized-bayes) (revision 602913) @@ -34,8 +34,12 @@ ok($sa); -ok($sa->{bayes_scanner}); +sub getimpl { + return $sa->call_plugins("learner_get_implementation"); +} +ok(getimpl && $sa->{bayes_scanner}); + ok(!$sa->{bayes_scanner}->is_scan_available()); open(MAIL,"< data/spam/001"); @@ -52,49 +56,49 @@ ok($mail); -my $body = $sa->{bayes_scanner}->get_body_from_msg($mail); +my $body = getimpl->get_body_from_msg($mail); ok($body); -my $toks = $sa->{bayes_scanner}->tokenize($mail, $body); +my $toks = getimpl->tokenize($mail, $body); ok(scalar(keys %{$toks}) > 0); -my($msgid,$msgid_hdr) = $sa->{bayes_scanner}->get_msgid($mail); +my($msgid,$msgid_hdr) = getimpl->get_msgid($mail); # $msgid is the generated hash messageid # $msgid_hdr is the Message-Id header ok($msgid eq 'ce33e4a8bc5798c65428d6018380bae346c7c126@sa_generated'); ok($msgid_hdr eq '9PS291LhupY'); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); -ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid)); +ok(!getimpl->{store}->seen_get($msgid)); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); ok($sa->{bayes_scanner}->learn(1, $mail)); ok(!$sa->{bayes_scanner}->learn(1, $mail)); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); -ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 's'); +ok(getimpl->{store}->seen_get($msgid) eq 's'); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); my $tokerror = 0; foreach my $tok (keys %{$toks}) { - my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok); + my ($spam, $ham, $atime) = getimpl->{store}->tok_get($tok); if ($spam == 0 || $ham > 0) { $tokerror = 1; } } ok(!$tokerror); -my $tokens = $sa->{bayes_scanner}->{store}->tok_get_all(keys %{$toks}); +my $tokens = getimpl->{store}->tok_get_all(keys %{$toks}); ok($tokens); @@ -107,36 +111,36 @@ } ok(!$tokerror); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); ok($sa->{bayes_scanner}->learn(0, $mail)); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); -ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 'h'); +ok(getimpl->{store}->seen_get($msgid) eq 'h'); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); $tokerror = 0; foreach my $tok (keys %{$toks}) { - my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok); + my ($spam, $ham, $atime) = getimpl->{store}->tok_get($tok); if ($spam > 0 || $ham == 0) { $tokerror = 1; } } ok(!$tokerror); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); ok($sa->{bayes_scanner}->forget($mail)); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); -ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid)); +ok(!getimpl->{store}->seen_get($msgid)); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); undef $sa; @@ -213,13 +217,13 @@ $mail = $sa->parse( $raw_message ); -$body = $sa->{bayes_scanner}->get_body_from_msg($mail); +$body = getimpl->get_body_from_msg($mail); my $msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail); ok($msgstatus); -my $score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body); +my $score = getimpl->scan($msgstatus, $mail, $body); # Pretty much we can't count on the data returned with such little training # so just make sure that the score wasn't equal to .5 which is the default @@ -238,11 +242,11 @@ $mail = $sa->parse( $raw_message ); -$body = $sa->{bayes_scanner}->get_body_from_msg($mail); +$body = getimpl->get_body_from_msg($mail); $msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail); -$score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body); +$score = getimpl->scan($msgstatus, $mail, $body); # Pretty much we can't count on the data returned with such little training # so just make sure that the score wasn't equal to .5 which is the default @@ -252,7 +256,7 @@ } -ok($sa->{bayes_scanner}->{store}->clear_database()); +ok(getimpl->{store}->clear_database()); ok(!-e 'log/user_state/bayes_journal'); ok(!-e 'log/user_state/bayes_seen'); Index: t/bayessdbm_seen_delete.t =================================================================== --- t/bayessdbm_seen_delete.t (.../trunk) (revision 602913) +++ t/bayessdbm_seen_delete.t (.../branches/bug-5293-pluginized-bayes) (revision 602913) @@ -34,8 +34,12 @@ ok($sa); -ok($sa->{bayes_scanner}); +sub getimpl { + return $sa->call_plugins("learner_get_implementation"); +} +ok($sa->{bayes_scanner} && getimpl); + ok(!$sa->{bayes_scanner}->is_scan_available()); open(MAIL,"< data/spam/001"); @@ -52,49 +56,49 @@ ok($mail); -my $body = $sa->{bayes_scanner}->get_body_from_msg($mail); +my $body = getimpl->get_body_from_msg($mail); ok($body); -my $toks = $sa->{bayes_scanner}->tokenize($mail, $body); +my $toks = getimpl->tokenize($mail, $body); ok(scalar(keys %{$toks}) > 0); -my($msgid,$msgid_hdr) = $sa->{bayes_scanner}->get_msgid($mail); +my($msgid,$msgid_hdr) = getimpl->get_msgid($mail); # $msgid is the generated hash messageid # $msgid_hdr is the Message-Id header ok($msgid eq 'ce33e4a8bc5798c65428d6018380bae346c7c126@sa_generated'); ok($msgid_hdr eq '9PS291LhupY'); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); -ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid)); +ok(!getimpl->{store}->seen_get($msgid)); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); ok($sa->{bayes_scanner}->learn(1, $mail)); ok(!$sa->{bayes_scanner}->learn(1, $mail)); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); -ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 's'); +ok(getimpl->{store}->seen_get($msgid) eq 's'); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); my $tokerror = 0; foreach my $tok (keys %{$toks}) { - my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok); + my ($spam, $ham, $atime) = getimpl->{store}->tok_get($tok); if ($spam == 0 || $ham > 0) { $tokerror = 1; } } ok(!$tokerror); -my $tokens = $sa->{bayes_scanner}->{store}->tok_get_all(keys %{$toks}); +my $tokens = getimpl->{store}->tok_get_all(keys %{$toks}); ok($tokens); @@ -107,36 +111,36 @@ } ok(!$tokerror); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); ok($sa->{bayes_scanner}->learn(0, $mail)); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); -ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 'h'); +ok(getimpl->{store}->seen_get($msgid) eq 'h'); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); $tokerror = 0; foreach my $tok (keys %{$toks}) { - my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok); + my ($spam, $ham, $atime) = getimpl->{store}->tok_get($tok); if ($spam > 0 || $ham == 0) { $tokerror = 1; } } ok(!$tokerror); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); ok($sa->{bayes_scanner}->forget($mail)); -ok($sa->{bayes_scanner}->{store}->tie_db_writable()); +ok(getimpl->{store}->tie_db_writable()); -ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid)); +ok(!getimpl->{store}->seen_get($msgid)); -$sa->{bayes_scanner}->{store}->untie_db(); +getimpl->{store}->untie_db(); undef $sa; @@ -221,13 +225,13 @@ $mail = $sa->parse( $raw_message ); -$body = $sa->{bayes_scanner}->get_body_from_msg($mail); +$body = getimpl->get_body_from_msg($mail); my $msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail); ok($msgstatus); -my $score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body); +my $score = getimpl->scan($msgstatus, $mail, $body); # Pretty much we can't count on the data returned with such little training # so just make sure that the score wasn't equal to .5 which is the default @@ -246,11 +250,11 @@ $mail = $sa->parse( $raw_message ); -$body = $sa->{bayes_scanner}->get_body_from_msg($mail); +$body = getimpl->get_body_from_msg($mail); $msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail); -$score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body); +$score = getimpl->scan($msgstatus, $mail, $body); # Pretty much we can't count on the data returned with such little training # so just make sure that the score wasn't equal to .5 which is the default @@ -260,7 +264,7 @@ } -ok($sa->{bayes_scanner}->{store}->clear_database()); +ok(getimpl->{store}->clear_database()); ok(!-e 'log/user_state/bayes_journal'); ok(!-e 'log/user_state/bayes_seen.pag'); Index: lib/Mail/SpamAssassin/OSBF/Store.pm =================================================================== --- lib/Mail/SpamAssassin/OSBF/Store.pm (.../trunk) (revision 0) +++ lib/Mail/SpamAssassin/OSBF/Store.pm (.../branches/bug-5293-pluginized-bayes) (revision 602913) @@ -0,0 +1,871 @@ +# <@LICENSE> +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to you under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +=head1 NAME + +Mail::SpamAssassin::OSBF::Store - OSBFian Storage Module + +=head1 DESCRIPTION + +This is the public API for the OSBFian store methods. Any implementation of +the storage module must implement these methods. + +=cut + +package Mail::SpamAssassin::OSBF::Store; + +use strict; +use warnings; +use bytes; +use re 'taint'; +use Mail::SpamAssassin::Logger; + +# TODO: if we ever get tuits, it'd be good to make these POD +# method docs more perlish... hardly a biggie. + +=head1 METHODS + +=over 4 + +=item new + +public class (Mail::SpamAssassin::OSBF::Store) new (Mail::SpamAssassin::Plugin::OSBF $bayes) + +Description: +This method creates a new instance of the +Mail::SpamAssassin::OSBF::Store object. You must pass in an instance of the +Mail::SpamAssassin::Plugin::OSBF object, which is stashed for use throughout +the module. + +=cut + +sub new { + my ($class, $osbf) = @_; + + $class = ref($class) || $class; + + my $self = { + 'osbf' => $osbf, + 'supported_db_version' => 0, + 'db_version' => undef, + }; + + bless ($self, $class); + + $self; +} + +=item DB_VERSION + +public instance (Integer) DB_VERSION () + +Description: +This method returns the currently supported database version for the +implementation. + +=cut + +sub DB_VERSION { + my ($self) = @_; + return $self->{supported_db_version}; +} + +=item read_db_configs + +public instance () read_db_configs () + +Description: +This method reads any needed config variables from the configuration +object and then calls the Mail::SpamAssassin::Plugin::OSBF read_db_configs method. + +=cut + +sub read_db_configs { + my ($self) = @_; + + # TODO: at some stage, this may be useful to read config items which + # control database bloat, like + # + # - use of hapaxes + # - use of case-sensitivity + # - more midrange-hapax-avoidance tactics when parsing headers (future) + # + # for now, we just set these settings statically. + my $conf = $self->{osbf}->{main}->{conf}; + + # Minimum desired database size? Expiry will not shrink the + # database below this number of entries. 100k entries is roughly + # equivalent to a 5Mb database file. + $self->{expiry_max_db_size} = $conf->{bayes_expiry_max_db_size}; + $self->{expiry_pct} = $conf->{bayes_expiry_pct}; + $self->{expiry_period} = $conf->{bayes_expiry_period}; + $self->{expiry_max_exponent} = $conf->{bayes_expiry_max_exponent}; +} + +=item tie_db_readonly + +public instance (Boolean) tie_db_readonly () + +Description: +This method opens up the database in readonly mode. + +=cut + +sub tie_db_readonly { + my ($self) = @_; + die "osbf: tie_db_readonly: not implemented\n"; +} + +=item tie_db_writable + +public instance (Boolean) tie_db_writable () + +Description: +This method opens up the database in writable mode. + +Any callers of this methods should ensure that they call untie_db() +afterwards. + +=cut + +sub tie_db_writable { + my ($self) = @_; + die "osbf: tie_db_writable: not implemented\n"; +} + +=item untie_db + +public instance () untie_db () + +Description: +This method unties the database. + +=cut + +sub untie_db { + my $self = shift; + die "osbf: untie_db: not implemented\n"; +} + +=item calculate_expire_delta + +public instance (%) calculate_expire_delta (Integer $newest_atime, + Integer $start, + Integer $max_expire_mult) + +Description: +This method performs a calculation on the data to determine the optimum +atime for token expiration. + +=cut + +sub calculate_expire_delta { + my ($self, $newest_atime, $start, $max_expire_mult) = @_; + die "osbf: calculate_expire_delta: not implemented\n"; +} + +=item token_expiration + +public instance (Integer, Integer, + Integer, Integer) token_expiration(\% $opts, + Integer $newest_atime, + Integer $newdelta) + +Description: +This method performs the database specific expiration of tokens based on +the passed in C<$newest_atime> and C<$newdelta>. + +=cut + +sub token_expiration { + my ($self, $opts, $newest_atime, $newdelta) = @_; + die "osbf: token_expiration: not implemented\n"; +} + +=item expire_old_tokens + +public instance (Boolean) expire_old_tokens (\% hashref) + +Description: +This method expires old tokens from the database. + +=cut + +sub expire_old_tokens { + my ($self, $opts) = @_; + my $ret; + + my $eval_stat; + eval { + local $SIG{'__DIE__'}; # do not run user die() traps in here + if ($self->tie_db_writable()) { + $ret = $self->expire_old_tokens_trapped ($opts); + } + 1; + } or do { + $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; + }; + + if (!$self->{osbf}->{main}->{learn_caller_will_untie}) { + $self->untie_db(); + } + + if (defined $eval_stat) { # if we died, untie the dbs. + warn "osbf: expire_old_tokens: $eval_stat\n"; + return 0; + } + $ret; +} + +=item expire_old_tokens_trapped + +public instance (Boolean) expire_old_tokens_trapped (\% $opts) + +Description: +This methods does the actual token expiration. + +XXX More docs here about the methodology and what not + +=cut + +sub expire_old_tokens_trapped { + my ($self, $opts) = @_; + + # Flag that we're doing work + $self->set_running_expire_tok(); + + # We don't need to do an expire, so why were we called? Oh well. + if (!$self->expiry_due()) { + $self->remove_running_expire_tok(); + return 0; + } + + my $started = time(); + my @vars = $self->get_storage_variables(); + + if ( $vars[10] > time ) { + dbg("osbf: expiry found newest atime in the future, resetting to current time"); + $vars[10] = time; + } + + # How many tokens do we want to keep? + my $goal_reduction = int($self->{expiry_max_db_size} * $self->{expiry_pct}); + dbg("osbf: expiry check keep size, ".$self->{expiry_pct}." * max: $goal_reduction"); + # Make sure we keep at least 100000 tokens in the DB + if ( $goal_reduction < 100000 ) { + $goal_reduction = 100000; + dbg("osbf: expiry keep size too small, resetting to 100,000 tokens"); + } + # Now turn goal_reduction into how many to expire. + $goal_reduction = $vars[3] - $goal_reduction; + dbg("osbf: token count: ".$vars[3].", final goal reduction size: $goal_reduction"); + + if ( $goal_reduction < 1000 ) { # too few tokens to expire, abort. + dbg("osbf: reduction goal of $goal_reduction is under 1,000 tokens, skipping expire"); + $self->set_last_expire(time()); + $self->remove_running_expire_tok(); # this won't be cleaned up, so do it now. + return 1; # we want to indicate things ran as expected + } + + # Estimate new atime delta based on the last atime delta + my $newdelta = 0; + if ( $vars[9] > 0 ) { + # newdelta = olddelta * old / goal; + # this may seem backwards, but since we're talking delta here, + # not actual atime, we want smaller atimes to expire more tokens, + # and visa versa. + # + $newdelta = int($vars[8] * $vars[9] / $goal_reduction); + } + + # Calculate size difference between last expiration token removal + # count and the current goal removal count. + my $ratio = ($vars[9] == 0 || $vars[9] > $goal_reduction) ? $vars[9]/$goal_reduction : $goal_reduction/$vars[9]; + + dbg("osbf: first pass? current: ".time().", Last: ".$vars[4].", atime: ".$vars[8].", count: ".$vars[9].", newdelta: $newdelta, ratio: $ratio, period: ".$self->{expiry_period}); + + ## ESTIMATION PHASE + # + # Do this for the first expire or "odd" looking results cause a first pass to determine atime: + # + # - last expire was more than 30 days ago + # assume mail flow stays roughly the same month to month, recompute if it's > 1 month + # - last atime delta was under expiry period + # if we're expiring often max_db_size should go up, but let's recompute just to check + # - last reduction count was < 1000 tokens + # ditto + # - new estimated atime delta is under expiry period + # ditto + # - difference of last reduction to current goal reduction is > 50% + # if the two values are out of balance, estimating atime is going to be funky, recompute + # + if ( (time() - $vars[4] > 86400*30) || ($vars[8] < $self->{expiry_period}) || ($vars[9] < 1000) + || ($newdelta < $self->{expiry_period}) || ($ratio > 1.5) ) { + dbg("osbf: can't use estimation method for expiry, unexpected result, calculating optimal atime delta (first pass)"); + + my $start = $self->{expiry_period}; # exponential search starting at ...? 1/2 day, 1, 2, 4, 8, 16, ... + my $max_expire_mult = 2**$self->{expiry_max_exponent}; # $max_expire_mult * $start = max expire time (256 days), power of 2. + + dbg("osbf: expiry max exponent: ".$self->{expiry_max_exponent}); + + my %delta = $self->calculate_expire_delta($vars[10], $start, $max_expire_mult); + + return 0 unless (%delta); + + # This will skip the for loop if debugging isn't enabled ... + if (would_log('dbg', 'bayes')) { + dbg("osbf: atime\ttoken reduction"); + dbg("osbf: ========\t==============="); + for(my $i = 1; $i<=$max_expire_mult; $i <<= 1) { + dbg("osbf: ".$start*$i."\t".(exists $delta{$i} ? $delta{$i} : 0)); + } + } + + # Now figure out which max_expire_mult value gives the closest results to goal_reduction, without + # going over ... Go from the largest delta backwards so the reduction size increases + # (tokens that expire at 4 also expire at 3, 2, and 1, so 1 will always be the largest expiry...) + # + for( ; $max_expire_mult > 0; $max_expire_mult>>=1 ) { + next unless exists $delta{$max_expire_mult}; + if ($delta{$max_expire_mult} > $goal_reduction) { + $max_expire_mult<<=1; # the max expire is actually the next power of 2 out + last; + } + } + + # if max_expire_mult gets to 0, either we can't expire anything, or 1 is <= $goal_reduction + $max_expire_mult ||= 1; + + # $max_expire_mult is now equal to the value we should use ... + # Check to see if the atime value we found is really good. + # It's not good if: + # - $max_expire_mult would not expire any tokens. This means that the majority of + # tokens are old or new, and more activity is required before an expiry can occur. + # - reduction count < 1000, not enough tokens to be worth doing an expire. + # + if ( !exists $delta{$max_expire_mult} || $delta{$max_expire_mult} < 1000 ) { + dbg("osbf: couldn't find a good delta atime, need more token difference, skipping expire"); + $self->set_last_expire(time()); + $self->remove_running_expire_tok(); # this won't be cleaned up, so do it now. + return 1; # we want to indicate things ran as expected + } + + $newdelta = $start * $max_expire_mult; + dbg("osbf: first pass decided on $newdelta for atime delta"); + } + else { # use the estimation method + dbg("osbf: can do estimation method for expiry, skipping first pass"); + } + + my ($kept, $deleted, $num_hapaxes, $num_lowfreq) = $self->token_expiration($opts, $newdelta, @vars); + + my $done = time(); + + my $msg = "expired old bayes database entries in ".($done - $started)." seconds"; + my $msg2 = "$kept entries kept, $deleted deleted"; + + if ($opts->{verbose}) { + my $hapax_pc = ($num_hapaxes * 100) / $kept; + my $lowfreq_pc = ($num_lowfreq * 100) / $kept; + print "$msg\n$msg2\n"; + printf "token frequency: 1-occurrence tokens: %3.2f%%\n", $hapax_pc; + printf "token frequency: less than 8 occurrences: %3.2f%%\n", $lowfreq_pc; + } + else { + dbg("osbf: $msg: $msg2"); + } + + return 1; +} + +=item sync_due + +public instance (Boolean) sync_due () + +Description: +This methods determines if a sync is due. + +=cut + +sub sync_due { + my ($self) = @_; + die "osbf: sync_due: not implemented\n"; +} + +=item expiry_due + +public instance (Boolean) expiry_due () + +Description: +This methods determines if an expire is due. + +=cut + +sub expiry_due { + my ($self) = @_; + + $self->read_db_configs(); # make sure this has happened here + + # If force expire was called, do the expire no matter what. + return 1 if ($self->{osbf}->{main}->{learn_force_expire}); + + # if config says not to auto expire then no need to continue + return 0 if ($self->{osbf}->{main}->{conf}->{bayes_auto_expire} == 0); + + # is the database too small for expiry? (Do *not* use "scalar keys", + # as this will iterate through the entire db counting them!) + my @vars = $self->get_storage_variables(); + my $ntoks = $vars[3]; + + my $last_expire = time() - $vars[4]; + if (!$self->{osbf}->{main}->{ignore_safety_expire_timeout}) { + # if we're not ignoring the safety timeout, don't run an expire more + # than once every 12 hours. + return 0 if ($last_expire < 43200); + } + else { + # if we are ignoring the safety timeout (e.g.: mass-check), still + # limit the expiry to only one every 5 minutes. + return 0 if ($last_expire < 300); + } + + dbg("osbf: DB expiry: tokens in DB: $ntoks, Expiry max size: ".$self->{expiry_max_db_size}.", Oldest atime: ".$vars[5].", Newest atime: ".$vars[10].", Last expire: ".$vars[4].", Current time: ".time(),'bayes','-1'); + + my $conf = $self->{osbf}->{main}->{conf}; + if ($ntoks <= 100000 || # keep at least 100k tokens + $self->{expiry_max_db_size} > $ntoks || # not enough tokens to cause an expire + $vars[10]-$vars[5] < 43200 || # delta between oldest and newest < 12h + $self->{db_version} < $self->DB_VERSION # ignore old db formats + ) { + return 0; + } + + return 1; +} + +=item seen_get + +public instance (Char) seen_get (String $msgid) + +Description: +This method retrieves the stored value, if any, for C<$msgid>. The return +value is the stored string ('s' for spam and 'h' for ham) or undef if +C<$msgid> is not found. Plus additional values (TODO) + +=cut + +sub seen_get { + my ($self, $msgid) = @_; + die "osbf: seen_get: not implemented\n"; +} + +=item seen_put + +public instance (Boolean) seen_put (String $msgid, Char $flag) + +Description: +This method records C<$msgid> as the type given by C<$flag>. C<$flag> is +one of two values 's' for spam and 'h' for ham, plus additional +values (TODO). + +=cut + +sub seen_put { + my ($self, $msgid, $flag) = @_; + die "osbf: seen_put: not implemented\n"; +} + +=item seen_delete + +public instance (Boolean) seen_delete (String $msgid) + +Description: +This method removes C<$msgid> from storage. + +=cut + +sub seen_delete { + my ($self, $msgid) = @_; + die "osbf: seen_delete: not implemented\n"; +} + +=item get_storage_variables + +public instance (@) get_storage_variables () + +Description: +This method retrieves the various administrative variables used by +the OSBF storage implementation. + +The values returned in the array are in the following order: + +0: scan count base + +1: number of spam + +2: number of ham + +3: number of tokens in db + +4: last expire atime + +5: oldest token in db atime + +6: db version value + +7: last journal sync + +8: last atime delta + +9: last expire reduction count + +10: newest token in db atime + +=cut + +sub get_storage_variables { + my ($self) = @_; + die "osbf: get_storage_variables: not implemented\n"; +} + +=item dump_db_toks + +public instance () dump_db_toks (String $template, String $regex, @ @vars) + +Description: +This method loops over all tokens, computing the probability for the token +and then printing it out according to the passed in template. + +=cut + +sub dump_db_toks { + my ($self, $template, $regex, @vars) = @_; + die "osbf: dump_db_toks: not implemented\n"; +} + +=item set_last_expire + +public instance (Boolean) _set_last_expire (Integer $time) + +Description: +This method sets the last expire time. + +=cut + +sub set_last_expire { + my ($self, $time) = @_; + die "osbf: set_last_expire: not implemented\n"; +} + +=item get_running_expire_tok + +public instance (Time) get_running_expire_tok () + +Description: +This method determines if an expire is currently running and returns the time +the expire started. + +=cut + +sub get_running_expire_tok { + my ($self) = @_; + die "osbf: get_running_expire_tok: not implemented\n"; +} + +=item set_running_expire_tok + +public instance (Time) set_running_expire_tok () + +Description: +This method sets the running expire time to the current time. + +=cut + +sub set_running_expire_tok { + my ($self) = @_; + die "osbf: set_running_expire_tok: not implemented\n"; +} + +=item remove_running_expire_tok + +public instance (Boolean) remove_running_expire_tok () + +Description: +This method removes a currently set running expire time. + +=cut + +sub remove_running_expire_tok { + my ($self) = @_; + die "osbf: remove_running_expire_tok: not implemented\n"; +} + +=item tok_get + +public instance (Integer, Integer, Time) tok_get (String $token) + +Description: +This method retrieves the specified token (C<$token>) from storage and returns +it's spam count, ham acount and last access time. + +=cut + +sub tok_get { + my ($self, $token) = @_; + die "osbf: tok_get: not implemented\n"; +} + +=item tok_get_all + +public instance (\@) tok_get_all (@ @tokens) + +Description: +This method retrieves the specified tokens (C<@tokens>) from storage and +returns an array ref of arrays spam count, ham count and last access time. + +=cut + +sub tok_get_all { + my ($self, $tokens) = @_; + die "osbf: tok_get_all: not implemented\n"; +} + +=item multi_tok_value_change + +public instance (Boolean) multi_tok_value_change (\% $weights, + String $atime) + +Description: +This method takes a C<$weights> hash ref and writes the associated +[ spam, ham ] value pairs to the store, along with updating each tokens +atime with C<$atime>. + +=cut + +sub multi_tok_value_change { + my ($self, $weights, $atime) = @_; + die "osbf: multi_tok_value_change: not implemented\n"; +} + +=item nspam_nham_get + +public instance (Integer, Integer) nspam_nham_get () + +Description: +This method retrieves the total number of spam and the total number of ham +currently under storage. + +=cut + +sub nspam_nham_get { + my ($self) = @_; + die "osbf: nspam_nham_get: not implemented\n"; +} + +=item nspam_nham_change + +public instance (Boolean) nspam_nham_change (Integer $num_spam, + Integer $num_ham) + +Description: +This method updates the number of spam and the number of ham in the database. + +=cut + +sub nspam_nham_change { + my ($self, $num_spam, $num_ham) = @_; + die "osbf: nspam_nham_change: not implemented\n"; +} + +=item tok_touch + +public instance (Boolean) tok_touch (String $token, + Time $atime) + +Description: +This method updates the given tokens (C<$token>) access time. + +=cut + +sub tok_touch { + my ($self, $token, $atime) = @_; + die "osbf: tok_touch: not implemanted\n"; +} + +=item tok_touch_all + +public instance (Boolean) tok_touch_all (\@ $tokens, + Time $atime) + +Description: +This method does a mass update of the given list of tokens C<$tokens>, if the existing token +atime is < C<$atime>. + +=cut + +sub tok_touch_all { + my ($self, $tokens, $atime) = @_; + die "osbf: tok_touch_all: not implemanted\n"; +} + +=item cleanup + +public instance (Boolean) cleanup () + +Description: +This method performs any cleanup necessary before moving onto the next +operation. + +=cut + +sub cleanup { + my ($self) = @_; + die "osbf: cleanup: not implemented\n"; +} + +=item get_magic_re + +public instance get_magic_re (String) + +Description: +This method returns a regexp which indicates a magic token. + +=cut + +sub get_magic_re { + my ($self) = @_; + die "osbf: get_magic_re: not implemented\n"; +} + +=item sync + +public instance (Boolean) sync (\% $opts) + +Description: +This method performs a sync of the database. + +=cut + +sub sync { + my ($self, $opts) = @_; + die "osbf: sync: not implemented\n"; +} + +=item perform_upgrade + +public instance (Boolean) perform_upgrade (\% $opts) + +Description: +This method is a utility method that performs any necessary upgrades +between versions. It should know how to handle previous versions and +what needs to happen to upgrade them. + +A true return value indicates success. + +=cut + +sub perform_upgrade { + my ($self, $opts) = @_; + die "osbf: perform_upgrade: not implemented\n"; +} + +=item clear_database + +public instance (Boolean) clear_database () + +Description: +This method deletes all records for a particular user. + +Callers should be aware that any errors returned by this method +could causes the database to be inconsistent for the given user. + +=cut + +sub clear_database { + my ($self) = @_; + die "osbf: clear_database: not implemented\n"; +} + +=item backup_database + +public instance (Boolean) backup_database () + +Description: +This method will dump the users database in a machine readable format. + +=cut + +sub backup_database { + my ($self) = @_; + die "osbf: backup_database: not implemented\n"; +} + +=item restore_database + +public instance (Boolean) restore_database (String $filename, Boolean $showdots) + +Description: +This method restores a database from the given filename, C<$filename>. + +Callers should be aware that any errors returned by this method +could causes the database to be inconsistent for the given user. + +=cut + +sub restore_database { + my ($self, $filename, $showdots) = @_; + die "osbf: restore_database: not implemented\n"; +} + +=item db_readable + +public instance (Boolean) db_readable () + +Description: +This method returns whether or not the OSBF DB is available in a +readable state. + +=cut + +sub db_readable { + my ($self) = @_; + die "osbf: db_readable: not implemented\n"; +} + +=item db_writable + +public instance (Boolean) db_writable () + +Description: +This method returns whether or not the OSBF DB is available in a +writable state. + +=cut + +sub db_writable { + my ($self) = @_; + die "osbf: db_writable: not implemented\n"; +} + + +sub sa_die { Mail::SpamAssassin::sa_die(@_); } + +1; + +=back + +=cut Index: lib/Mail/SpamAssassin/OSBF/Store/DBM.pm =================================================================== --- lib/Mail/SpamAssassin/OSBF/Store/DBM.pm (.../trunk) (revision 0) +++ lib/Mail/SpamAssassin/OSBF/Store/DBM.pm (.../branches/bug-5293-pluginized-bayes) (revision 602913) @@ -0,0 +1,1906 @@ +# <@LICENSE> +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to you under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +package Mail::SpamAssassin::OSBF::Store::DBM; + +use strict; +use warnings; +use bytes; +use Fcntl; +use re 'taint'; + +use Mail::SpamAssassin; +use Mail::SpamAssassin::Util qw(untaint_var); +use Mail::SpamAssassin::OSBF::Store; +use Mail::SpamAssassin::Logger; +use Digest::SHA1 qw(sha1); +use File::Basename; +use File::Spec; +use File::Path; + +use constant MAGIC_RE => qr/^\015\001\007\011\003/; + +use vars qw{ + @ISA + @DBNAMES + $NSPAM_MAGIC_TOKEN $NHAM_MAGIC_TOKEN $LAST_EXPIRE_MAGIC_TOKEN $LAST_JOURNAL_SYNC_MAGIC_TOKEN + $NTOKENS_MAGIC_TOKEN $OLDEST_TOKEN_AGE_MAGIC_TOKEN $LAST_EXPIRE_REDUCE_MAGIC_TOKEN + $RUNNING_EXPIRE_MAGIC_TOKEN $DB_VERSION_MAGIC_TOKEN $LAST_ATIME_DELTA_MAGIC_TOKEN + $NEWEST_TOKEN_AGE_MAGIC_TOKEN +}; + +@ISA = qw( Mail::SpamAssassin::OSBF::Store ); + +@DBNAMES = qw(toks seen); + +# These are the magic tokens we use to track stuff in the DB. +# The format is '^M^A^G^I^C' followed by any string you want. +# None of the control chars will be in a real token. +$DB_VERSION_MAGIC_TOKEN = "\015\001\007\011\003DBVERSION"; +$LAST_ATIME_DELTA_MAGIC_TOKEN = "\015\001\007\011\003LASTATIMEDELTA"; +$LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE"; +$LAST_EXPIRE_REDUCE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIREREDUCE"; +$LAST_JOURNAL_SYNC_MAGIC_TOKEN = "\015\001\007\011\003LASTJOURNALSYNC"; +$NEWEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003NEWESTAGE"; +$NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM"; +$NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM"; +$NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS"; +$OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE"; +$RUNNING_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003RUNNINGEXPIRE"; + +sub HAS_DBM_MODULE { + my ($self) = @_; + if (exists($self->{has_dbm_module})) { + return $self->{has_dbm_module}; + } + $self->{has_dbm_module} = eval { require DB_File; }; +} + +sub DBM_MODULE { + return "DB_File"; +} + +# Possible file extensions used by the kinds of database files DB_File +# might create. We need these so we can create a new file and rename +# it into place. +sub DB_EXTENSIONS { + return ('', '.db'); +} + +########################################################################### + +sub new { + my $class = shift; + $class = ref($class) || $class; + + my $self = $class->SUPER::new(@_); + + $self->{supported_db_version} = 3; + + $self->{already_tied} = 0; + $self->{is_locked} = 0; + $self->{string_to_journal} = ''; + $self->{last_refresh} = 0; + + $self; +} + +########################################################################### + +sub tie_db_readonly { + my ($self) = @_; + + if (!$self->HAS_DBM_MODULE) { + dbg("osbf: " . $self->DBM_MODULE . " module not installed, cannot use OSBF"); + return 0; + } + + # return if we've already tied to the db's, using the same mode + # (locked/unlocked) as before. + return 1 if ($self->{already_tied} && $self->{is_locked} == 0); + + my $main = $self->{osbf}->{main}; + if (!defined($main->{conf}->{osbf_path})) { + dbg("osbf: osbf_path not defined"); + return 0; + } + + $self->read_db_configs(); + + my $path = $main->sed_path($main->{conf}->{osbf_path}); + + my $found = 0; + for my $ext ($self->DB_EXTENSIONS) { + if (-f $path.'_toks'.$ext) { + $found = 1; + last; + } + } + + if (!$found) { + dbg("osbf: no dbs present, cannot tie DB R/O: ${path}_toks"); + return 0; + } + + foreach my $dbname (@DBNAMES) { + my $name = $path.'_'.$dbname; + my $db_var = 'db_'.$dbname; + dbg("osbf: tie-ing to DB file R/O $name"); + + # untie %{$self->{$db_var}} if (tied %{$self->{$db_var}}); + if (!tie %{$self->{$db_var}},$self->DBM_MODULE, $name, O_RDONLY, + (oct($main->{conf}->{osbf_file_mode}) & 0666)) + { + # bug 2975: it's acceptable for the db_seen to not be present, + # to allow it to be recycled. if that's the case, just create + # a new, empty one. we don't need to lock it, since we won't + # be writing to it; let the R/W api deal with that case. + + if ($dbname eq 'seen') { + tie %{$self->{$db_var}},$self->DBM_MODULE, $name, O_RDWR|O_CREAT, + (oct($main->{conf}->{osbf_file_mode}) & 0666) + or goto failed_to_tie; + } + else { + goto failed_to_tie; + } + } + } + + $self->{db_version} = ($self->get_storage_variables())[6]; + dbg("osbf: found bayes db version ".$self->{db_version}); + + # If the DB version is one we don't understand, abort! + if ($self->_check_db_version() != 0) { + warn("osbf: bayes db version ".$self->{db_version}." is not able to be used, aborting!"); + $self->untie_db(); + return 0; + } + + $self->{already_tied} = 1; + return 1; + +failed_to_tie: + warn "osbf: cannot open bayes databases ${path}_* R/O: tie failed: $!\n"; + foreach my $dbname (@DBNAMES) { + my $db_var = 'db_'.$dbname; + next unless exists $self->{$db_var}; + dbg("osbf: untie-ing DB file $dbname"); + untie %{$self->{$db_var}}; + } + + return 0; +} + +# tie() to the databases, read-write and locked. Any callers of +# this should ensure they call untie_db() afterwards! +# +sub tie_db_writable { + my ($self) = @_; + + if (!$self->HAS_DBM_MODULE) { + dbg("osbf: " . $self->DBM_MODULE . " module not installed, cannot use bayes"); + return 0; + } + + # Useful shortcut ... + my $main = $self->{osbf}->{main}; + + # if we've already tied the db's using the same mode + # (locked/unlocked) as we want now, freshen the lock and return. + if ($self->{already_tied} && $self->{is_locked} == 1) { + + # don't refresh too frequently; once every 15 secs should be plenty + my $now = time; + if (!$self->{last_refresh} || ($now - $self->{last_refresh} > 15)) { + $self->{last_refresh} = $now; + $main->{locker}->refresh_lock($self->{locked_file}); + } + + return 1; + } + + if (!defined($main->{conf}->{osbf_path})) { + dbg("osbf: osbf_path not defined"); + return 0; + } + + $self->read_db_configs(); + + my $path = $main->sed_path($main->{conf}->{osbf_path}); + + my $found = 0; + for my $ext ($self->DB_EXTENSIONS) { + if (-f $path.'_toks'.$ext) { + $found = 1; + last; + } + } + + my $parentdir = dirname($path); + if (!-d $parentdir) { + # run in an eval(); if mkpath has no perms, it calls die() + eval { + mkpath($parentdir, 0, (oct($main->{conf}->{osbf_file_mode}) & 0777)); + }; + } + + my $tout; + if ($main->{learn_wait_for_lock}) { + $tout = 300; # TODO: Dan to write better lock code + } else { + $tout = 10; + } + if ($main->{locker}->safe_lock($path, $tout, $main->{conf}->{osbf_file_mode})) + { + $self->{locked_file} = $path; + $self->{is_locked} = 1; + } else { + warn "osbf: cannot open bayes databases ${path}_* R/W: lock failed: $!\n"; + return 0; + } + + my $umask = umask 0; + foreach my $dbname (@DBNAMES) { + my $name = $path.'_'.$dbname; + my $db_var = 'db_'.$dbname; + dbg("osbf: tie-ing to DB file R/W $name"); + + ($self->DBM_MODULE eq 'DB_File') and + Mail::SpamAssassin::Util::avoid_db_file_locking_bug ($name); + + tie %{$self->{$db_var}},$self->DBM_MODULE, $name, O_RDWR|O_CREAT, + (oct($main->{conf}->{osbf_file_mode}) & 0666) + or goto failed_to_tie; + } + umask $umask; + + # set our cache to what version DB we're using + $self->{db_version} = ($self->get_storage_variables())[6]; + # don't bother printing this unless found since it would be bogus anyway + dbg("osbf: found bayes db version ".$self->{db_version}) if ($found); + + # figure out if we can read the current DB and if we need to do a + # DB version update and do it if necessary if either has a problem, + # fail immediately + # + if ($found && !$self->_upgrade_db()) { + $self->untie_db(); + return 0; + } + elsif (!$found) { # new DB, make sure we know that ... + $self->{db_version} = $self->{db_toks}->{$DB_VERSION_MAGIC_TOKEN} = $self->DB_VERSION; + $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN} = 0; # no tokens in the db ... + dbg("osbf: new db, set db version ".$self->{db_version}." and 0 tokens"); + } + + $self->{already_tied} = 1; + return 1; + +failed_to_tie: + my $err = $!; + umask $umask; + + foreach my $dbname (@DBNAMES) { + my $db_var = 'db_'.$dbname; + next unless exists $self->{$db_var}; + dbg("osbf: untie-ing DB file $dbname"); + untie %{$self->{$db_var}}; + } + + if ($self->{is_locked}) { + $self->{osbf}->{main}->{locker}->safe_unlock($self->{locked_file}); + $self->{is_locked} = 0; + } + warn "osbf: cannot open bayes databases ${path}_* R/W: tie failed: $err\n"; + return 0; +} + +# Do we understand how to deal with this DB version? +sub _check_db_version { + my ($self) = @_; + + # return -1 if older, 0 if current, 1 if newer + return $self->{db_version} <=> $self->DB_VERSION; +} + +# Check to see if we need to upgrade the DB, and do so if necessary +sub _upgrade_db { + my ($self) = @_; + + my $verschk = $self->_check_db_version(); + my $res = 0; # used later on for tie() checks + my $umask; # used later for umask modifications + + # If the DB is the latest version, no problem. + return 1 if ($verschk == 0); + + # If the DB is a newer version that we know what to do with ... abort! + if ($verschk == 1) { + warn("osbf: bayes db version ".$self->{db_version}." is newer than we understand, aborting!"); + return 0; + } + + # If the current DB version is lower than the new version, upgrade! + # Do conversions in order so we can go 1 -> 3, make sure to update + # $self->{db_version} along the way + + dbg("osbf: detected bayes db format ".$self->{db_version}.", upgrading"); + + # since DB_File will not shrink a database (!!), we need to *create* + # a new one instead. + my $main = $self->{osbf}->{main}; + my $path = $main->sed_path($main->{conf}->{osbf_path}); + my $name = $path.'_toks'; + + # older version's journal files are likely not in the same format as the new ones, so remove it. + my $jpath = $self->_get_journal_filename(); + if (-f $jpath) { + dbg("osbf: old journal file found, removing"); + warn "osbf: couldn't remove $jpath: $!" if (!unlink $jpath); + } + + if ($self->{db_version} < 2) { + dbg("osbf: upgrading database format from v".$self->{db_version}." to v2"); + $self->set_running_expire_tok(); + + my ($DB_NSPAM_MAGIC_TOKEN, $DB_NHAM_MAGIC_TOKEN, $DB_NTOKENS_MAGIC_TOKEN); + my ($DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN, $DB_LAST_EXPIRE_MAGIC_TOKEN); + + # Magic tokens for version 0, defined as '**[A-Z]+' + if ($self->{db_version} == 0) { + $DB_NSPAM_MAGIC_TOKEN = '**NSPAM'; + $DB_NHAM_MAGIC_TOKEN = '**NHAM'; + $DB_NTOKENS_MAGIC_TOKEN = '**NTOKENS'; + #$DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN = '**OLDESTAGE'; + #$DB_LAST_EXPIRE_MAGIC_TOKEN = '**LASTEXPIRE'; + #$DB_SCANCOUNT_BASE_MAGIC_TOKEN = '**SCANBASE'; + #$DB_RUNNING_EXPIRE_MAGIC_TOKEN = '**RUNNINGEXPIRE'; + } + else { + $DB_NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM"; + $DB_NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM"; + $DB_NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS"; + #$DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE"; + #$DB_LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE"; + #$DB_SCANCOUNT_BASE_MAGIC_TOKEN = "\015\001\007\011\003SCANBASE"; + #$DB_RUNNING_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003RUNNINGEXPIRE"; + } + + # remember when we started ... + my $started = time; + my $newatime = $started; + + # use O_EXCL to avoid races (bonus paranoia, since we should be locked + # anyway) + my %new_toks; + $umask = umask 0; + $res = tie %new_toks, $self->DBM_MODULE, "${name}.new", O_RDWR|O_CREAT|O_EXCL, + (oct($main->{conf}->{osbf_file_mode}) & 0666); + umask $umask; + return 0 unless $res; + undef $res; + + # add the magic tokens to the new db. + $new_toks{$NSPAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NSPAM_MAGIC_TOKEN}; + $new_toks{$NHAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NHAM_MAGIC_TOKEN}; + $new_toks{$NTOKENS_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NTOKENS_MAGIC_TOKEN}; + $new_toks{$DB_VERSION_MAGIC_TOKEN} = 2; # we're now a DB version 2 file + $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $newatime; + $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = $newatime; + $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $newatime; + $new_toks{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = $newatime; + $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = 0; + $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = 0; + + # deal with the data tokens + my ($tok, $packed); + my $count = 0; + while (($tok, $packed) = each %{$self->{db_toks}}) { + next if ($tok =~ /^(?:\*\*[A-Z]+$|\015\001\007\011\003)/); # skip magic tokens + + my ($ts, $th, $atime) = $self->tok_unpack($packed); + $new_toks{$tok} = $self->tok_pack($ts, $th, $newatime); + + # Refresh the lock every so often... + if (($count++ % 1000) == 0) { + $self->set_running_expire_tok(); + } + } + + + # now untie so we can do renames + untie %{$self->{db_toks}}; + untie %new_toks; + + # This is the critical phase (moving files around), so don't allow + # it to be interrupted. + local $SIG{'INT'} = 'IGNORE'; + local $SIG{'TERM'} = 'IGNORE'; + local $SIG{'HUP'} = 'IGNORE' if (!Mail::SpamAssassin::Util::am_running_on_windows()); + + # older versions used scancount, so kill the stupid little file ... + my $msgc = $path.'_msgcount'; + if (-f $msgc) { + dbg("osbf: old msgcount file found, removing"); + if (!unlink $msgc) { + warn "osbf: couldn't remove $msgc: $!"; + } + } + + # now rename in the new one. Try several extensions + for my $ext ($self->DB_EXTENSIONS) { + my $newf = $name.'.new'.$ext; + my $oldf = $name.$ext; + next unless (-f $newf); + if (!rename ($newf, $oldf)) { + warn "osbf: rename $newf to $oldf failed: $!\n"; + return 0; + } + } + + # re-tie to the new db in read-write mode ... + $umask = umask 0; + $res = tie %{$self->{db_toks}},$self->DBM_MODULE, $name, O_RDWR|O_CREAT, + (oct($main->{conf}->{osbf_file_mode}) & 0666); + umask $umask; + return 0 unless $res; + undef $res; + + dbg("osbf: upgraded database format from v".$self->{db_version}." to v2 in ".(time - $started)." seconds"); + $self->{db_version} = 2; # need this for other functions which check + } + + # Version 3 of the database converts all existing tokens to SHA1 hashes + if ($self->{db_version} == 2) { + dbg("osbf: upgrading database format from v".$self->{db_version}." to v3"); + $self->set_running_expire_tok(); + + my $DB_NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM"; + my $DB_NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM"; + my $DB_NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS"; + my $DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE"; + my $DB_LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE"; + my $DB_NEWEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003NEWESTAGE"; + my $DB_LAST_JOURNAL_SYNC_MAGIC_TOKEN = "\015\001\007\011\003LASTJOURNALSYNC"; + my $DB_LAST_ATIME_DELTA_MAGIC_TOKEN = "\015\001\007\011\003LASTATIMEDELTA"; + my $DB_LAST_EXPIRE_REDUCE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIREREDUCE"; + + # remember when we started ... + my $started = time; + + # use O_EXCL to avoid races (bonus paranoia, since we should be locked + # anyway) + my %new_toks; + $umask = umask 0; + $res = tie %new_toks, $self->DBM_MODULE, "${name}.new", O_RDWR|O_CREAT|O_EXCL, + (oct($main->{conf}->{osbf_file_mode}) & 0666); + umask $umask; + return 0 unless $res; + undef $res; + + # add the magic tokens to the new db. + $new_toks{$NSPAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NSPAM_MAGIC_TOKEN}; + $new_toks{$NHAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NHAM_MAGIC_TOKEN}; + $new_toks{$NTOKENS_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NTOKENS_MAGIC_TOKEN}; + $new_toks{$DB_VERSION_MAGIC_TOKEN} = 3; # we're now a DB version 3 file + $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $self->{db_toks}->{$DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN}; + $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = $self->{db_toks}->{$DB_LAST_EXPIRE_MAGIC_TOKEN}; + $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NEWEST_TOKEN_AGE_MAGIC_TOKEN}; + $new_toks{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = $self->{db_toks}->{$DB_LAST_JOURNAL_SYNC_MAGIC_TOKEN}; + $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = $self->{db_toks}->{$DB_LAST_ATIME_DELTA_MAGIC_TOKEN}; + $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} =$self->{db_toks}->{$DB_LAST_EXPIRE_REDUCE_MAGIC_TOKEN}; + + # deal with the data tokens + my $count = 0; + while (my ($tok, $packed) = each %{$self->{db_toks}}) { + next if ($tok =~ /^\015\001\007\011\003/); # skip magic tokens + my $tok_hash = substr(sha1($tok), -5); + $new_toks{$tok_hash} = $packed; + + # Refresh the lock every so often... + if (($count++ % 1000) == 0) { + $self->set_running_expire_tok(); + } + } + + # now untie so we can do renames + untie %{$self->{db_toks}}; + untie %new_toks; + + # This is the critical phase (moving files around), so don't allow + # it to be interrupted. + local $SIG{'INT'} = 'IGNORE'; + local $SIG{'TERM'} = 'IGNORE'; + local $SIG{'HUP'} = 'IGNORE' if (!Mail::SpamAssassin::Util::am_running_on_windows()); + + # now rename in the new one. Try several extensions + for my $ext ($self->DB_EXTENSIONS) { + my $newf = $name.'.new'.$ext; + my $oldf = $name.$ext; + next unless (-f $newf); + if (!rename($newf, $oldf)) { + warn "osbf: rename $newf to $oldf failed: $!\n"; + return 0; + } + } + + # re-tie to the new db in read-write mode ... + $umask = umask 0; + $res = tie %{$self->{db_toks}},$self->DBM_MODULE, $name, O_RDWR|O_CREAT, + (oct ($main->{conf}->{osbf_file_mode}) & 0666); + umask $umask; + return 0 unless $res; + undef $res; + + dbg("osbf: upgraded database format from v".$self->{db_version}." to v3 in ".(time - $started)." seconds"); + + $self->{db_version} = 3; # need this for other functions which check + } + + # if ($self->{db_version} == 3) { + # ... + # $self->{db_version} = 4; # need this for other functions which check + # } + # ... and so on. + + return 1; +} + +########################################################################### + +sub untie_db { + my $self = shift; + + return if (!$self->{already_tied}); + + dbg("osbf: untie-ing"); + + foreach my $dbname (@DBNAMES) { + my $db_var = 'db_'.$dbname; + + if (exists $self->{$db_var}) { + # dbg("osbf: untie-ing $db_var"); + untie %{$self->{$db_var}}; + delete $self->{$db_var}; + } + } + + if ($self->{is_locked}) { + dbg("osbf: files locked, now unlocking lock"); + $self->{osbf}->{main}->{locker}->safe_unlock ($self->{locked_file}); + $self->{is_locked} = 0; + } + + $self->{already_tied} = 0; + $self->{db_version} = undef; +} + +########################################################################### + +sub calculate_expire_delta { + my ($self, $newest_atime, $start, $max_expire_mult) = @_; + + my %delta; # use a hash since an array is going to be very sparse + + # do the first pass, figure out atime delta + my ($tok, $packed); + while (($tok, $packed) = each %{$self->{db_toks}}) { + next if ($tok =~ MAGIC_RE); # skip magic tokens + + my ($ts, $th, $atime) = $self->tok_unpack ($packed); + + # Go through from $start * 1 to $start * 512, mark how many tokens + # we would expire + my $token_age = $newest_atime - $atime; + for (my $i = 1; $i <= $max_expire_mult; $i<<=1) { + if ($token_age >= $start * $i) { + $delta{$i}++; + } + else { + # If the token age is less than the expire delta, it'll be + # less for all upcoming checks too, so abort early. + last; + } + } + } + return %delta; +} + +########################################################################### + +sub token_expiration { + my ($self, $opts, $newdelta, @vars) = @_; + + my $deleted = 0; + my $kept = 0; + my $num_hapaxes = 0; + my $num_lowfreq = 0; + + # since DB_File will not shrink a database (!!), we need to *create* + # a new one instead. + my $main = $self->{osbf}->{main}; + my $path = $main->sed_path($main->{conf}->{osbf_path}); + + # use a temporary PID-based suffix just in case another one was + # created previously by an interrupted expire + my $tmpsuffix = "expire$$"; + my $tmpdbname = $path.'_toks.'.$tmpsuffix; + + # clean out any leftover db copies from previous runs + for my $ext ($self->DB_EXTENSIONS) { unlink ($tmpdbname.$ext); } + + # use O_EXCL to avoid races (bonus paranoia, since we should be locked + # anyway) + my %new_toks; + my $umask = umask 0; + tie %new_toks, $self->DBM_MODULE, $tmpdbname, O_RDWR|O_CREAT|O_EXCL, + (oct ($main->{conf}->{osbf_file_mode}) & 0666); + umask $umask; + my $oldest; + + my $showdots = $opts->{showdots}; + if ($showdots) { print STDERR "\n"; } + + # We've chosen a new atime delta if we've gotten here, so record it + # for posterity. + $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = $newdelta; + + # Figure out how old is too old... + my $too_old = $vars[10] - $newdelta; # tooold = newest - delta + + # Go ahead and do the move to new db/expire run now ... + my ($tok, $packed); + while (($tok, $packed) = each %{$self->{db_toks}}) { + next if ($tok =~ MAGIC_RE); # skip magic tokens + + my ($ts, $th, $atime) = $self->tok_unpack ($packed); + + if ($atime < $too_old) { + $deleted++; + } + else { + # if token atime > newest, reset to newest ... + if ($atime > $vars[10]) { + $atime = $vars[10]; + } + + $new_toks{$tok} = $self->tok_pack ($ts, $th, $atime); $kept++; + if (!defined($oldest) || $atime < $oldest) { $oldest = $atime; } + if ($ts + $th == 1) { + $num_hapaxes++; + } elsif ($ts < 8 && $th < 8) { + $num_lowfreq++; + } + } + + if ((($kept + $deleted) % 1000) == 0) { + if ($showdots) { print STDERR "."; } + $self->set_running_expire_tok(); + } + } + + # and add the magic tokens. don't add the expire_running token. + $new_toks{$DB_VERSION_MAGIC_TOKEN} = $self->DB_VERSION; + + # We haven't changed messages of each type seen, so just copy over. + $new_toks{$NSPAM_MAGIC_TOKEN} = $vars[1]; + $new_toks{$NHAM_MAGIC_TOKEN} = $vars[2]; + + # We magically haven't removed the newest token, so just copy that value over. + $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $vars[10]; + + # The rest of these have been modified, so replace as necessary. + $new_toks{$NTOKENS_MAGIC_TOKEN} = $kept; + $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = time(); + $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $oldest; + $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = $deleted; + + # Sanity check: if we expired too many tokens, abort! + if ($kept < 100000) { + dbg("osbf: token expiration would expire too many tokens, aborting"); + # set the magic tokens appropriately + # make sure the next expire run does a first pass + $self->{db_toks}->{$LAST_EXPIRE_MAGIC_TOKEN} = time(); + $self->{db_toks}->{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = 0; + $self->{db_toks}->{$LAST_ATIME_DELTA_MAGIC_TOKEN} = 0; + + # remove the new DB + untie %new_toks; + for my $ext ($self->DB_EXTENSIONS) { unlink ($tmpdbname.$ext); } + + # reset the results for the return + $kept = $vars[3]; + $deleted = 0; + $num_hapaxes = 0; + $num_lowfreq = 0; + } + else { + # now untie so we can do renames + untie %{$self->{db_toks}}; + untie %new_toks; + + # This is the critical phase (moving files around), so don't allow + # it to be interrupted. Scope the signal changes. + { + local $SIG{'INT'} = 'IGNORE'; + local $SIG{'TERM'} = 'IGNORE'; + local $SIG{'HUP'} = 'IGNORE' if (!Mail::SpamAssassin::Util::am_running_on_windows()); + + # now rename in the new one. Try several extensions + for my $ext ($self->DB_EXTENSIONS) { + my $newf = $tmpdbname.$ext; + my $oldf = $path.'_toks'.$ext; + next unless (-f $newf); + if (!rename ($newf, $oldf)) { + warn "osbf: rename $newf to $oldf failed: $!\n"; + } + } + } + } + + # Call untie_db() so we unlock correctly. + $self->untie_db(); + + return ($kept, $deleted, $num_hapaxes, $num_lowfreq); +} + +########################################################################### + +# Is a sync due? +sub sync_due { + my ($self) = @_; + + # don't bother doing old db versions + return 0 if ($self->{db_version} < $self->DB_VERSION); + + my $conf = $self->{osbf}->{main}->{conf}; + return 0 if ($conf->{bayes_journal_max_size} == 0); + + my @vars = $self->get_storage_variables(); + dbg("osbf: DB journal sync: last sync: ".$vars[7],'bayes','-1'); + + ## Ok, should we do a sync? + + # Not if the journal file doesn't exist, it's not a file, or it's 0 + # bytes long. + return 0 unless (stat($self->_get_journal_filename()) && -f _); + + # Yes if the file size is larger than the specified maximum size. + return 1 if (-s _ > $conf->{bayes_journal_max_size}); + + # Yes there has been a sync before, and if it's been at least a day + # since that sync. + return 1 if (($vars[7] > 0) && (time - $vars[7] > 86400)); + + # No, I guess not. + return 0; +} + +########################################################################### +# db_seen reading APIs + +sub seen_get { + my ($self, $msgid) = @_; + $self->{db_seen}->{$msgid}; +} + +sub seen_put { + my ($self, $msgid, $seen) = @_; + + if ($self->{osbf}->{main}->{learn_to_journal}) { + $self->defer_update ("m $seen $msgid"); + } + else { + $self->_seen_put_direct($msgid, $seen); + } +} +sub _seen_put_direct { + my ($self, $msgid, $seen) = @_; + $self->{db_seen}->{$msgid} = $seen; +} + +sub seen_delete { + my ($self, $msgid) = @_; + + if ($self->{osbf}->{main}->{learn_to_journal}) { + $self->defer_update ("m f $msgid"); + } + else { + $self->_seen_delete_direct($msgid); + } +} +sub _seen_delete_direct { + my ($self, $msgid) = @_; + delete $self->{db_seen}->{$msgid}; +} + +########################################################################### +# db reading APIs + +sub tok_get { + my ($self, $tok) = @_; + $self->tok_unpack ($self->{db_toks}->{$tok}); +} + +sub tok_get_all { + my ($self, @tokens) = @_; + + my @tokensdata; + foreach my $token (@tokens) { + my ($tok_spam, $tok_ham, $atime) = $self->tok_unpack($self->{db_toks}->{$token}); + push(@tokensdata, [$token, $tok_spam, $tok_ham, $atime]); + } + return \@tokensdata; +} + +# return the magic tokens in a specific order: +# 0: scan count base +# 1: number of spam +# 2: number of ham +# 3: number of tokens in db +# 4: last expire atime +# 5: oldest token in db atime +# 6: db version value +# 7: last journal sync +# 8: last atime delta +# 9: last expire reduction count +# 10: newest token in db atime +# +sub get_storage_variables { + my ($self) = @_; + my @values; + + my $db_ver = $self->{db_toks}->{$DB_VERSION_MAGIC_TOKEN}; + + if (!$db_ver || $db_ver =~ /\D/) { $db_ver = 0; } + + if ($db_ver >= 2) { + my $DB2_LAST_ATIME_DELTA_MAGIC_TOKEN = "\015\001\007\011\003LASTATIMEDELTA"; + my $DB2_LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE"; + my $DB2_LAST_EXPIRE_REDUCE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIREREDUCE"; + my $DB2_LAST_JOURNAL_SYNC_MAGIC_TOKEN = "\015\001\007\011\003LASTJOURNALSYNC"; + my $DB2_NEWEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003NEWESTAGE"; + my $DB2_NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM"; + my $DB2_NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM"; + my $DB2_NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS"; + my $DB2_OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE"; + my $DB2_RUNNING_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003RUNNINGEXPIRE"; + + @values = ( + 0, + $self->{db_toks}->{$DB2_NSPAM_MAGIC_TOKEN}, + $self->{db_toks}->{$DB2_NHAM_MAGIC_TOKEN}, + $self->{db_toks}->{$DB2_NTOKENS_MAGIC_TOKEN}, + $self->{db_toks}->{$DB2_LAST_EXPIRE_MAGIC_TOKEN}, + $self->{db_toks}->{$DB2_OLDEST_TOKEN_AGE_MAGIC_TOKEN}, + $db_ver, + $self->{db_toks}->{$DB2_LAST_JOURNAL_SYNC_MAGIC_TOKEN}, + $self->{db_toks}->{$DB2_LAST_ATIME_DELTA_MAGIC_TOKEN}, + $self->{db_toks}->{$DB2_LAST_EXPIRE_REDUCE_MAGIC_TOKEN}, + $self->{db_toks}->{$DB2_NEWEST_TOKEN_AGE_MAGIC_TOKEN}, + ); + } + elsif ($db_ver == 0) { + my $DB0_NSPAM_MAGIC_TOKEN = '**NSPAM'; + my $DB0_NHAM_MAGIC_TOKEN = '**NHAM'; + my $DB0_OLDEST_TOKEN_AGE_MAGIC_TOKEN = '**OLDESTAGE'; + my $DB0_LAST_EXPIRE_MAGIC_TOKEN = '**LASTEXPIRE'; + my $DB0_NTOKENS_MAGIC_TOKEN = '**NTOKENS'; + my $DB0_SCANCOUNT_BASE_MAGIC_TOKEN = '**SCANBASE'; + + @values = ( + $self->{db_toks}->{$DB0_SCANCOUNT_BASE_MAGIC_TOKEN}, + $self->{db_toks}->{$DB0_NSPAM_MAGIC_TOKEN}, + $self->{db_toks}->{$DB0_NHAM_MAGIC_TOKEN}, + $self->{db_toks}->{$DB0_NTOKENS_MAGIC_TOKEN}, + $self->{db_toks}->{$DB0_LAST_EXPIRE_MAGIC_TOKEN}, + $self->{db_toks}->{$DB0_OLDEST_TOKEN_AGE_MAGIC_TOKEN}, + 0, + 0, + 0, + 0, + 0, + ); + } + elsif ($db_ver == 1) { + my $DB1_NSPAM_MAGIC_TOKEN = "\015\001\007\011\003NSPAM"; + my $DB1_NHAM_MAGIC_TOKEN = "\015\001\007\011\003NHAM"; + my $DB1_OLDEST_TOKEN_AGE_MAGIC_TOKEN = "\015\001\007\011\003OLDESTAGE"; + my $DB1_LAST_EXPIRE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIRE"; + my $DB1_NTOKENS_MAGIC_TOKEN = "\015\001\007\011\003NTOKENS"; + my $DB1_SCANCOUNT_BASE_MAGIC_TOKEN = "\015\001\007\011\003SCANBASE"; + + @values = ( + $self->{db_toks}->{$DB1_SCANCOUNT_BASE_MAGIC_TOKEN}, + $self->{db_toks}->{$DB1_NSPAM_MAGIC_TOKEN}, + $self->{db_toks}->{$DB1_NHAM_MAGIC_TOKEN}, + $self->{db_toks}->{$DB1_NTOKENS_MAGIC_TOKEN}, + $self->{db_toks}->{$DB1_LAST_EXPIRE_MAGIC_TOKEN}, + $self->{db_toks}->{$DB1_OLDEST_TOKEN_AGE_MAGIC_TOKEN}, + 1, + 0, + 0, + 0, + 0, + ); + } + + foreach (@values) { + if (!$_ || $_ =~ /\D/) { + $_ = 0; + } + } + + return @values; +} + +sub dump_db_toks { + my ($self, $template, $regex, @vars) = @_; + + while (my ($tok, $tokvalue) = each %{$self->{db_toks}}) { + next if ($tok =~ MAGIC_RE); # skip magic tokens + next if (defined $regex && ($tok !~ /$regex/o)); + + # We have the value already, so just unpack it. + my ($ts, $th, $atime) = $self->tok_unpack ($tokvalue); + + my $prob = $self->{osbf}->_compute_prob_for_token($tok, $vars[1], $vars[2], $ts, $th); + $prob ||= 0.5; + + my $encoded_tok = unpack("H*",$tok); + printf $template,$prob,$ts,$th,$atime,$encoded_tok; + } +} + +sub set_last_expire { + my ($self, $time) = @_; + $self->{db_toks}->{$LAST_EXPIRE_MAGIC_TOKEN} = time(); +} + +## Don't bother using get_magic_tokens here. This token should only +## ever exist when we're running expire, so we don't want to convert it if +## it's there and we're not expiring ... +sub get_running_expire_tok { + my ($self) = @_; + my $running = $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN}; + if (!$running || $running =~ /\D/) { return undef; } + return $running; +} + +sub set_running_expire_tok { + my ($self) = @_; + + # update the lock and running expire magic token + $self->{osbf}->{main}->{locker}->refresh_lock ($self->{locked_file}); + $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN} = time(); +} + +sub remove_running_expire_tok { + my ($self) = @_; + delete $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN}; +} + +########################################################################### + +# db abstraction: allow deferred writes, since we will be frequently +# writing while checking. + +sub tok_count_change { + my ($self, $ds, $dh, $tok, $atime) = @_; + + $atime = 0 unless defined $atime; + + if ($self->{bayes}->{main}->{learn_to_journal}) { + # we can't store the SHA1 binary value in the journal, so convert it + # to a printable value that can be converted back later + my $encoded_tok = unpack("H*",$tok); + $self->defer_update ("c $ds $dh $atime $encoded_tok"); + } else { + $self->tok_sync_counters ($ds, $dh, $atime, $tok); + } +} + +sub multi_tok_count_change { + my ($self, $ds, $dh, $tokens, $atime) = @_; + + $atime = 0 unless defined $atime; + + foreach my $tok (keys %{$tokens}) { + if ($self->{bayes}->{main}->{learn_to_journal}) { + # we can't store the SHA1 binary value in the journal, so convert it + # to a printable value that can be converted back later + my $encoded_tok = unpack("H*",$tok); + $self->defer_update ("c $ds $dh $atime $encoded_tok"); + } else { + $self->tok_sync_counters ($ds, $dh, $atime, $tok); + } + } +} + +sub nspam_nham_get { + my ($self) = @_; + my @vars = $self->get_storage_variables(); + ($vars[1], $vars[2]); +} + +sub nspam_nham_change { + my ($self, $ds, $dh) = @_; + + if ($self->{osbf}->{main}->{learn_to_journal}) { + $self->defer_update ("n $ds $dh"); + } else { + $self->tok_sync_nspam_nham ($ds, $dh); + } +} + +sub tok_touch { + my ($self, $tok, $atime) = @_; + # we can't store the SHA1 binary value in the journal, so convert it + # to a printable value that can be converted back later + my $encoded_tok = unpack("H*", $tok); + $self->defer_update ("t $atime $encoded_tok"); +} + +sub tok_touch_all { + my ($self, $tokens, $atime) = @_; + + foreach my $token (@{$tokens}) { + # we can't store the SHA1 binary value in the journal, so convert it + # to a printable value that can be converted back later + my $encoded_tok = unpack("H*", $token); + $self->defer_update ("t $atime $encoded_tok"); + } +} + +sub defer_update { + my ($self, $str) = @_; + $self->{string_to_journal} .= "$str\n"; +} + +########################################################################### + +sub cleanup { + my ($self) = @_; + + my $nbytes = length ($self->{string_to_journal}); + return if ($nbytes == 0); + + my $path = $self->_get_journal_filename(); + + # use append mode, write atomically, then close, so simultaneous updates are + # not lost + my $conf = $self->{osbf}->{main}->{conf}; + + # set the umask to the inverse of what we want ... + my $umask = umask(0777 - (oct ($conf->{osbf_file_mode}) & 0666)); + + if (!open (OUT, ">>".$path)) { + warn "osbf: cannot write to $path, bayes db update ignored: $!\n"; + umask $umask; # reset umask + return; + } + umask $umask; # reset umask + + # do not use print() here, it will break up the buffer if it's >8192 bytes, + # which could result in two sets of tokens getting mixed up and their + # touches missed. + my $write_failure = 0; + my $original_point = tell OUT; + my $len; + do { + $len = syswrite (OUT, $self->{string_to_journal}, $nbytes); + + # argh, write failure, give up + if (!defined $len || $len < 0) { + my $err = ''; + if (!defined $len) { + $len = 0; + $err = " ($!)"; + } + warn "osbf: write failed to OSBF journal $path ($len of $nbytes)!$err\n"; + last; + } + + # This shouldn't happen, but could if the fs is full... + if ($len != $nbytes) { + warn "osbf: partial write to bayes journal $path ($len of $nbytes), recovering\n"; + + # we want to be atomic, so revert the journal file back to where + # we know it's "good". if we can't truncate the journal, or we've + # tried 5 times to do the write, abort! + if (!truncate(OUT, $original_point) || ($write_failure++ > 4)) { + warn "osbf: cannot write to bayes journal $path, aborting!\n"; + last; + } + + # if the fs is full, let's give the system a break + sleep 1; + } + } while ($len != $nbytes); + + if (!close OUT) { + warn "osbf: cannot write to $path, bayes db update ignored\n"; + } + + $self->{string_to_journal} = ''; +} + +# Return a qr'd RE to match a token with the correct format's magic token +sub get_magic_re { + my ($self) = @_; + + if (!defined $self->{db_version} || $self->{db_version} >= 1) { + return MAGIC_RE; + } + + # When in doubt, assume v0 + return qr/^\*\*[A-Z]+$/; +} + +# provide a more generalized public interface into the journal sync + +sub sync { + my ($self, $opts) = @_; + + return $self->_sync_journal($opts); +} + +########################################################################### +# And this method reads the journal and applies the changes in one +# (locked) transaction. + +sub _sync_journal { + my ($self, $opts) = @_; + my $ret = 0; + + my $path = $self->_get_journal_filename(); + + # if $path doesn't exist, or it's not a file, or is 0 bytes in length, return + if (!stat($path) || !-f _ || -z _) { + return 0; + } + + my $eval_stat; + eval { + local $SIG{'__DIE__'}; # do not run user die() traps in here + if ($self->tie_db_writable()) { + $ret = $self->_sync_journal_trapped($opts, $path); + } + 1; + } or do { + $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; + }; + + # ok, untie from write-mode if we can + if (!$self->{osbf}->{main}->{learn_caller_will_untie}) { + $self->untie_db(); + } + + # handle any errors that may have occurred + if (defined $eval_stat) { + warn "osbf: $eval_stat\n"; + return 0; + } + + $ret; +} + +sub _sync_journal_trapped { + my ($self, $opts, $path) = @_; + + # Flag that we're doing work + $self->set_running_expire_tok(); + + my $started = time(); + my $count = 0; + my $total_count = 0; + my %tokens; + my $showdots = $opts->{showdots}; + my $retirepath = $path.".old"; + + # if $path doesn't exist, or it's not a file, or is 0 bytes in length, + # return we have to check again since the file may have been removed + # by a recent bayes db upgrade ... + if (!stat($path) || !-f _ || -z _) { + return 0; + } + + if (!-r $path) { # will we be able to read the file? + warn "osbf: bad permissions on journal, can't read: $path\n"; + return 0; + } + + # This is the critical phase (moving files around), so don't allow + # it to be interrupted. + { + local $SIG{'INT'} = 'IGNORE'; + local $SIG{'TERM'} = 'IGNORE'; + local $SIG{'HUP'} = 'IGNORE' if (!Mail::SpamAssassin::Util::am_running_on_windows()); + + # retire the journal, so we can update the db files from it in peace. + # TODO: use locking here + if (!rename ($path, $retirepath)) { + warn "osbf: failed rename $path to $retirepath\n"; + return 0; + } + + # now read the retired journal + if (!open (JOURNAL, "<$retirepath")) { + warn "osbf: cannot open read $retirepath\n"; + return 0; + } + + + # Read the journal + while () { + $total_count++; + + if (/^t (\d+) (.+)$/) { # Token timestamp update, cache resultant entries + my $tok = pack("H*",$2); + $tokens{$tok} = $1+0 if (!exists $tokens{$tok} || $1+0 > $tokens{$tok}); + } elsif (/^c (-?\d+) (-?\d+) (\d+) (.+)$/) { # Add/full token update + my $tok = pack("H*",$4); + $self->tok_sync_counters ($1+0, $2+0, $3+0, $tok); + $count++; + } elsif (/^n (-?\d+) (-?\d+)$/) { # update ham/spam count + $self->tok_sync_nspam_nham ($1+0, $2+0); + $count++; + } elsif (/^m ([hsf]) (.+)$/) { # update msgid seen database + if ($1 eq "f") { + $self->_seen_delete_direct($2); + } + else { + $self->_seen_put_direct($2,$1); + } + $count++; + } else { + warn "osbf: gibberish entry found in journal: $_"; + } + } + close JOURNAL; + + # Now that we've determined what tokens we need to update and their + # final values, update the DB. Should be much smaller than the full + # journal entries. + while (my ($k,$v) = each %tokens) { + $self->tok_touch_token ($v, $k); + + if ((++$count % 1000) == 0) { + if ($showdots) { print STDERR "."; } + $self->set_running_expire_tok(); + } + } + + if ($showdots) { print STDERR "\n"; } + + # we're all done, so unlink the old journal file + unlink ($retirepath) || warn "osbf: can't unlink $retirepath: $!\n"; + + $self->{db_toks}->{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = $started; + + my $done = time(); + my $msg = ("osbf: synced databases from journal in " . + ($done - $started) . + " seconds: $count unique entries ($total_count total entries)"); + + if ($opts->{verbose}) { + print $msg,"\n"; + } else { + dbg($msg); + } + } + + # else, that's the lot, we're synced. return + return 1; +} + +sub tok_touch_token { + my ($self, $atime, $tok) = @_; + my ($ts, $th, $oldatime) = $self->tok_get ($tok); + + # If the new atime is < the old atime, ignore the update + # We figure that we'll never want to lower a token atime, so abort if + # we try. (journal out of sync, etc.) + return if ($oldatime >= $atime); + + $self->tok_put ($tok, $ts, $th, $atime); +} + +sub tok_sync_counters { + my ($self, $ds, $dh, $atime, $tok) = @_; + my ($ts, $th, $oldatime) = $self->tok_get ($tok); + $ts += $ds; if ($ts < 0) { $ts = 0; } + $th += $dh; if ($th < 0) { $th = 0; } + + # Don't roll the atime of tokens backwards ... + $atime = $oldatime if ($oldatime > $atime); + + $self->tok_put ($tok, $ts, $th, $atime); +} + +sub tok_put { + my ($self, $tok, $ts, $th, $atime) = @_; + $ts ||= 0; + $th ||= 0; + + # Ignore magic tokens, the don't go in this way ... + return if ($tok =~ MAGIC_RE); + + # use defined() rather than exists(); the latter is not supported + # by NDBM_File, believe it or not. Using defined() did not + # indicate any noticeable speed hit in my testing. (Mar 31 2003 jm) + my $exists_already = defined $self->{db_toks}->{$tok}; + + if ($ts == 0 && $th == 0) { + return if (!$exists_already); # If the token doesn't exist, just return + $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN}--; + delete $self->{db_toks}->{$tok}; + } else { + if (!$exists_already) { # If the token doesn't exist, raise the token count + $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN}++; + } + + $self->{db_toks}->{$tok} = $self->tok_pack ($ts, $th, $atime); + + my $newmagic = $self->{db_toks}->{$NEWEST_TOKEN_AGE_MAGIC_TOKEN}; + if (!defined ($newmagic) || $atime > $newmagic) { + $self->{db_toks}->{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $atime; + } + + # Make sure to check for either !defined or "" ... Apparently + # sometimes the DB module doesn't return the value correctly. :( + my $oldmagic = $self->{db_toks}->{$OLDEST_TOKEN_AGE_MAGIC_TOKEN}; + if (!defined ($oldmagic) || $oldmagic eq "" || $atime < $oldmagic) { + $self->{db_toks}->{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $atime; + } + } +} + +sub tok_sync_nspam_nham { + my ($self, $ds, $dh) = @_; + my ($ns, $nh) = ($self->get_storage_variables())[1,2]; + if ($ds) { $ns += $ds; } if ($ns < 0) { $ns = 0; } + if ($dh) { $nh += $dh; } if ($nh < 0) { $nh = 0; } + $self->{db_toks}->{$NSPAM_MAGIC_TOKEN} = $ns; + $self->{db_toks}->{$NHAM_MAGIC_TOKEN} = $nh; +} + +########################################################################### + +sub _get_journal_filename { + my ($self) = @_; + + my $main = $self->{osbf}->{main}; + return $main->sed_path($main->{conf}->{osbf_path}."_journal"); +} + +########################################################################### + +# this is called directly from sa-learn(1). +sub perform_upgrade { + my ($self, $opts) = @_; + my $ret = 0; + + my $eval_stat; + eval { + local $SIG{'__DIE__'}; # do not run user die() traps in here + + use File::Basename; + use File::Copy; + + # bayes directory + my $main = $self->{osbf}->{main}; + my $path = $main->sed_path($main->{conf}->{osbf_path}); + my $dir = dirname($path); + + # make temporary copy since old dbm and new dbm may have same name + opendir(DIR, $dir) || die "osbf: can't opendir $dir: $!"; + my @files = grep { /^osbf_(?:seen|toks)(?:\.\w+)?$/ } readdir(DIR); + closedir(DIR); + if (@files < 2 || !grep(/osbf_seen/,@files) || !grep(/osbf_toks/,@files)) + { + die "osbf: unable to find osbf_toks and osbf_seen, stopping\n"; + } + # untaint @files (already safe after grep) + untaint_var(\@files); + + for (@files) { + my $src = "$dir/$_"; + my $dst = "$dir/old_$_"; + copy($src, $dst) || die "osbf: can't copy $src to $dst: $!\n"; + } + + # delete previous to make way for import + for (@files) { unlink("$dir/$_"); } + + # import + if ($self->tie_db_writable()) { + $ret += $self->upgrade_old_dbm_files_trapped("$dir/old_osbf_seen", + $self->{db_seen}); + $ret += $self->upgrade_old_dbm_files_trapped("$dir/old_osbf_toks", + $self->{db_toks}); + } + + if ($ret == 2) { + print "import successful, original files saved with \"old\" prefix\n"; + } + else { + print "import failed, original files saved with \"old\" prefix\n"; + } + 1; + } or do { + $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; + }; + + $self->untie_db(); + + # if we died, untie the dbm files + if (defined $eval_stat) { + warn "osbf: perform_upgrade: $eval_stat\n"; + return 0; + } + $ret; +} + +sub upgrade_old_dbm_files_trapped { + my ($self, $filename, $output) = @_; + + my $count; + my %in; + + print "upgrading to DB_File, please be patient: $filename\n"; + + # try each type of file until we find one with > 0 entries + for my $dbm ('DB_File', 'GDBM_File', 'NDBM_File', 'SDBM_File') { + $count = 0; + # wrap in eval so it doesn't run in general use. This accesses db + # modules directly. + # Note: (bug 2390), the 'use' needs to be on the same line as the eval + # for RPM dependency checks to work properly. It's lame, but... + my $eval_stat; + eval 'use ' . $dbm . '; + tie %in, "' . $dbm . '", $filename, O_RDONLY, 0600; + %{ $output } = %in; + $count = scalar keys %{ $output }; + untie %in; + 1; + ' or do { + $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; + }; + if (defined $eval_stat) { + print "$dbm: $dbm module not installed(?), nothing copied: $eval_stat\n"; + dbg("osbf: error was: $eval_stat"); + } + elsif ($count == 0) { + print "$dbm: no database of that kind found, nothing copied\n"; + } + else { + print "$dbm: copied $count entries\n"; + return 1; + } + } + + return 0; +} + +sub clear_database { + my ($self) = @_; + + return 0 unless ($self->tie_db_writable()); + + dbg("osbf: untie-ing in preparation for removal."); + + foreach my $dbname (@DBNAMES) { + my $db_var = 'db_'.$dbname; + + if (exists $self->{$db_var}) { + # dbg("osbf: untie-ing $db_var"); + untie %{$self->{$db_var}}; + delete $self->{$db_var}; + } + } + + my $path = $self->{osbf}->{main}->sed_path($self->{osbf}->{main}->{conf}->{osbf_path}); + + foreach my $dbname (@DBNAMES, 'journal') { + foreach my $ext ($self->DB_EXTENSIONS) { + my $name = $path.'_'.$dbname.$ext; + my $ret = unlink $name; + dbg("osbf: clear_database: " . ($ret ? 'removed' : 'tried to remove') . " $name"); + } + } + + # the journal file needs to be done separately since it has no extension + foreach my $dbname ('journal') { + my $name = $path.'_'.$dbname; + my $ret = unlink $name; + dbg("osbf: clear_database: " . ($ret ? 'removed' : 'tried to remove') . " $name"); + } + + $self->untie_db(); + + return 1; +} + +sub backup_database { + my ($self) = @_; + + # we tie writable because we want the upgrade code to kick in if needed + return 0 unless ($self->tie_db_writable()); + + my @vars = $self->get_storage_variables(); + + print "v\t$vars[6]\tdb_version # this must be the first line!!!\n"; + print "v\t$vars[1]\tnum_spam\n"; + print "v\t$vars[2]\tnum_nonspam\n"; + + while (my ($tok, $packed) = each %{$self->{db_toks}}) { + next if ($tok =~ MAGIC_RE); # skip magic tokens + + my ($ts, $th, $atime) = $self->tok_unpack($packed); + my $encoded_token = unpack("H*",$tok); + print "t\t$ts\t$th\t$atime\t$encoded_token\n"; + } + + while (my ($msgid, $flag) = each %{$self->{db_seen}}) { + print "s\t$flag\t$msgid\n"; + } + + $self->untie_db(); + + return 1; +} + +sub restore_database { + my ($self, $filename, $showdots) = @_; + + if (!open(DUMPFILE, '<', $filename)) { + dbg("osbf: unable to open backup file $filename: $!"); + return 0; + } + + if (!$self->tie_db_writable()) { + dbg("osbf: failed to tie db writable"); + return 0; + } + + my $main = $self->{osbf}->{main}; + my $path = $main->sed_path($main->{conf}->{osbf_path}); + + # use a temporary PID-based suffix just in case another one was + # created previously by an interrupted expire + my $tmpsuffix = "convert$$"; + my $tmptoksdbname = $path.'_toks.'.$tmpsuffix; + my $tmpseendbname = $path.'_seen.'.$tmpsuffix; + my $toksdbname = $path.'_toks'; + my $seendbname = $path.'_seen'; + + my %new_toks; + my %new_seen; + my $umask = umask 0; + unless (tie %new_toks, $self->DBM_MODULE, $tmptoksdbname, O_RDWR|O_CREAT|O_EXCL, + (oct ($main->{conf}->{osbf_file_mode}) & 0666)) { + dbg("osbf: failed to tie temp toks db: $!"); + $self->untie_db(); + umask $umask; + return 0; + } + unless (tie %new_seen, $self->DBM_MODULE, $tmpseendbname, O_RDWR|O_CREAT|O_EXCL, + (oct ($main->{conf}->{osbf_file_mode}) & 0666)) { + dbg("osbf: failed to tie temp seen db: $!"); + untie %new_toks; + $self->_unlink_file($tmptoksdbname); + $self->untie_db(); + umask $umask; + return 0; + } + umask $umask; + + my $line_count = 0; + my $db_version; + my $token_count = 0; + my $num_spam; + my $num_ham; + my $error_p = 0; + my $newest_token_age = 0; + # Kinda wierd I know, but we need a nice big value and we know there will be + # no tokens > time() since we reset atime if > time(), so use that with a + # little buffer just in case. + my $oldest_token_age = time() + 100000; + + my $line = ; + $line_count++; + + # We require the database version line to be the first in the file so we can + # figure out how to properly deal with the file. If it is not the first + # line then fail + if ($line =~ m/^v\s+(\d+)\s+db_version/) { + $db_version = $1; + } + else { + dbg("osbf: database version must be the first line in the backup file, correct and re-run"); + untie %new_toks; + untie %new_seen; + $self->_unlink_file($tmptoksdbname); + $self->_unlink_file($tmpseendbname); + $self->untie_db(); + return 0; + } + + unless ($db_version == 2 || $db_version == 3) { + warn("osbf: database version $db_version is unsupported, must be version 2 or 3"); + untie %new_toks; + untie %new_seen; + $self->_unlink_file($tmptoksdbname); + $self->_unlink_file($tmpseendbname); + $self->untie_db(); + return 0; + } + + while (my $line = ) { + chomp($line); + $line_count++; + + if ($line_count % 1000 == 0) { + print STDERR "." if ($showdots); + } + + if ($line =~ /^v\s+/) { # variable line + my @parsed_line = split(/\s+/, $line, 3); + my $value = $parsed_line[1] + 0; + if ($parsed_line[2] eq 'num_spam') { + $num_spam = $value; + } + elsif ($parsed_line[2] eq 'num_nonspam') { + $num_ham = $value; + } + else { + dbg("osbf: restore_database: skipping unknown line: $line"); + } + } + elsif ($line =~ /^t\s+/) { # token line + my @parsed_line = split(/\s+/, $line, 5); + my $spam_count = $parsed_line[1] + 0; + my $ham_count = $parsed_line[2] + 0; + my $atime = $parsed_line[3] + 0; + my $token = $parsed_line[4]; + + my $token_warn_p = 0; + my @warnings; + + if ($spam_count < 0) { + $spam_count = 0; + push(@warnings, 'spam count < 0, resetting'); + $token_warn_p = 1; + } + if ($ham_count < 0) { + $ham_count = 0; + push(@warnings, 'ham count < 0, resetting'); + $token_warn_p = 1; + } + + if ($spam_count == 0 && $ham_count == 0) { + dbg("osbf: token has zero spam and ham count, skipping"); + next; + } + + if ($atime > time()) { + $atime = time(); + push(@warnings, 'atime > current time, resetting'); + $token_warn_p = 1; + } + + if ($token_warn_p) { + dbg("osbf: token ($token) has the following warnings:\n".join("\n",@warnings)); + } + + # database versions < 3 did not encode their token values + if ($db_version < 3) { + $token = substr(sha1($token), -5); + } + else { + # turn unpacked binary token back into binary value + $token = pack("H*",$token); + } + + $new_toks{$token} = $self->tok_pack($spam_count, $ham_count, $atime); + if ($atime < $oldest_token_age) { + $oldest_token_age = $atime; + } + if ($atime > $newest_token_age) { + $newest_token_age = $atime; + } + $token_count++; + } + elsif ($line =~ /^s\s+/) { # seen line + my @parsed_line = split(/\s+/, $line, 3); + my $flag = $parsed_line[1]; + my $msgid = $parsed_line[2]; + + unless ($flag eq 'h' || $flag eq 's') { + dbg("osbf: unknown seen flag ($flag) for line: $line, skipping"); + next; + } + + unless ($msgid) { + dbg("osbf: blank msgid for line: $line, skipping"); + next; + } + + $new_seen{$msgid} = $flag; + } + else { + dbg("osbf: skipping unknown line: $line"); + next; + } + } + close(DUMPFILE); + + print STDERR "\n" if ($showdots); + + unless (defined($num_spam)) { + dbg("osbf: unable to find num spam, please check file"); + $error_p = 1; + } + + unless (defined($num_ham)) { + dbg("osbf: unable to find num ham, please check file"); + $error_p = 1; + } + + if ($error_p) { + dbg("osbf: error(s) while attempting to load $filename, correct and re-run"); + + untie %new_toks; + untie %new_seen; + $self->_unlink_file($tmptoksdbname); + $self->_unlink_file($tmpseendbname); + $self->untie_db(); + return 0; + } + + # set the calculated magic tokens + $new_toks{$DB_VERSION_MAGIC_TOKEN} = $self->DB_VERSION(); + $new_toks{$NTOKENS_MAGIC_TOKEN} = $token_count; + $new_toks{$NSPAM_MAGIC_TOKEN} = $num_spam; + $new_toks{$NHAM_MAGIC_TOKEN} = $num_ham; + $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $newest_token_age; + $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $oldest_token_age; + + # go ahead and zero out these, chances are good that they are bogus anyway. + $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = 0; + $new_toks{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = 0; + $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = 0; + $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = 0; + + local $SIG{'INT'} = 'IGNORE'; + local $SIG{'TERM'} = 'IGNORE'; + local $SIG{'HUP'} = 'IGNORE' if (!Mail::SpamAssassin::Util::am_running_on_windows()); + + untie %new_toks; + untie %new_seen; + $self->untie_db(); + + # Here is where something can go horribly wrong and screw up the bayes + # database files. If we are able to copy one and not the other then it + # will leave the database in an inconsistent state. Since this is an + # edge case, and they're trying to replace the DB anyway we should be ok. + unless ($self->_rename_file($tmptoksdbname, $toksdbname)) { + dbg("osbf: error while renaming $tmptoksdbname to $toksdbname: $!"); + return 0; + } + unless ($self->_rename_file($tmpseendbname, $seendbname)) { + dbg("osbf: error while renaming $tmpseendbname to $seendbname: $!"); + dbg("osbf: database now in inconsistent state"); + return 0; + } + + dbg("osbf: parsed $line_count lines"); + dbg("osbf: created database with $token_count tokens based on $num_spam spam messages and $num_ham ham messages"); + + return 1; +} + +########################################################################### + +# token marshalling format for db_toks. + +# Since we may have many entries with few hits, especially thousands of hapaxes +# (1-occurrence entries), use a flexible entry format, instead of simply "2 +# packed ints", to keep the memory and disk space usage down. In my +# 18k-message test corpus, only 8.9% have >= 8 hits in either counter, so we +# can use a 1-byte representation for the other 91% of low-hitting entries +# and save masses of space. + +# This looks like: XXSSSHHH (XX = format bits, SSS = 3 spam-count bits, HHH = 3 +# ham-count bits). If XX in the first byte is 11, it's packed as this 1-byte +# representation; otherwise, if XX in the first byte is 00, it's packed as +# "CLL", ie. 1 byte and 2 32-bit "longs" in perl pack format. + +# Savings: roughly halves size of toks db, at the cost of a ~10% slowdown. + +use constant FORMAT_FLAG => 0xc0; # 11000000 +use constant ONE_BYTE_FORMAT => 0xc0; # 11000000 +use constant TWO_LONGS_FORMAT => 0x00; # 00000000 + +use constant ONE_BYTE_SSS_BITS => 0x38; # 00111000 +use constant ONE_BYTE_HHH_BITS => 0x07; # 00000111 + +sub tok_unpack { + my ($self, $value) = @_; + $value ||= 0; + + my ($packed, $atime) = unpack("CV", $value); + + if (($packed & FORMAT_FLAG) == ONE_BYTE_FORMAT) { + return (($packed & ONE_BYTE_SSS_BITS) >> 3, + $packed & ONE_BYTE_HHH_BITS, + $atime || 0); + } + elsif (($packed & FORMAT_FLAG) == TWO_LONGS_FORMAT) { + my ($packed, $ts, $th, $atime) = unpack("CVVV", $value); + return ($ts || 0, $th || 0, $atime || 0); + } + # other formats would go here... + else { + warn "osbf: unknown packing format for bayes db, please re-learn: $packed"; + return (0, 0, 0); + } +} + +sub tok_pack { + my ($self, $ts, $th, $atime) = @_; + $ts ||= 0; $th ||= 0; $atime ||= 0; + if ($ts < 8 && $th < 8) { + return pack ("CV", ONE_BYTE_FORMAT | ($ts << 3) | $th, $atime); + } else { + return pack ("CVVV", TWO_LONGS_FORMAT, $ts, $th, $atime); + } +} + +########################################################################### + +sub db_readable { + my ($self) = @_; + return $self->{already_tied}; +} + +sub db_writable { + my ($self) = @_; + return $self->{already_tied} && $self->{is_locked}; +} + +########################################################################### + +sub _unlink_file { + my ($self, $filename) = @_; + + unlink $filename; +} + +sub _rename_file { + my ($self, $sourcefilename, $targetfilename) = @_; + + return 0 unless (rename($sourcefilename, $targetfilename)); + + return 1; +} + +sub sa_die { Mail::SpamAssassin::sa_die(@_); } + +1; Index: lib/Mail/SpamAssassin/Plugin/OSBF.pm =================================================================== --- lib/Mail/SpamAssassin/Plugin/OSBF.pm (.../trunk) (revision 0) +++ lib/Mail/SpamAssassin/Plugin/OSBF.pm (.../branches/bug-5293-pluginized-bayes) (revision 602913) @@ -0,0 +1,1372 @@ +# <@LICENSE> +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to you under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +=head1 NAME + +Mail::SpamAssassin::Plugin::OSBF - OSBF learning classifier + +=head1 DESCRIPTION + +This plugin implements a trained probabilistic classifier, using an algorithm +based on Winnow, as described in section 2 of _Combining Winnow and Orthogonal +Sparse Bigrams for Incremental Spam Filtering_, by Siefkes, Assis, Chhabra and +Yerazunis: + + http://www.siefkes.net/ie/winnow-spam.pdf + http://en.wikipedia.org/wiki/Winnow + +The tokenizer uses Orthogonal Sparse Bigrams, as described in that paper. + +The results are incorporated into SpamAssassin as the OSBF_* rules. + +=head1 METHODS + +=over 4 + +=cut + +package Mail::SpamAssassin::Plugin::OSBF; + +use strict; +use warnings; +use bytes; +use re 'taint'; + +use Mail::SpamAssassin; +use Mail::SpamAssassin::Plugin; +use Mail::SpamAssassin::PerMsgStatus; +use Mail::SpamAssassin::Logger; +use Mail::SpamAssassin::Util qw(untaint_var); + +use Mail::SpamAssassin::Bayes::CombineNaiveBayes; + +use Digest::SHA1 qw(sha1 sha1_hex); + +our @ISA = qw(Mail::SpamAssassin::Plugin); + +use vars qw{ + $IGNORED_HDRS + $MARK_PRESENCE_ONLY_HDRS + $OPPORTUNISTIC_LOCK_VALID +}; + +# Which headers should we scan for tokens? Don't use all of them, as it's easy +# to pick up spurious clues from some. What we now do is use all of them +# *less* these well-known headers; that way we can pick up spammers' tracking +# headers (which are obviously not well-known in advance!). + +# Received is handled specially +$IGNORED_HDRS = qr{(?: (?:X-)?Sender # misc noise + |Delivered-To |Delivery-Date + |(?:X-)?Envelope-To + |X-MIME-Auto[Cc]onverted |X-Converted-To-Plain-Text + + |Subject # not worth a tiny gain vs. to db size increase + + # Date: can provide invalid cues if your spam corpus is + # older/newer than ham + |Date + + # List headers: ignore. a spamfiltering mailing list will + # become a nonspam sign. + |X-List|(?:X-)?Mailing-List + |(?:X-)?List-(?:Archive|Help|Id|Owner|Post|Subscribe + |Unsubscribe|Host|Id|Manager|Admin|Comment + |Name|Url) + |X-Unsub(?:scribe)? + |X-Mailman-Version |X-Been[Tt]here |X-Loop + |Mail-Followup-To + |X-eGroups-(?:Return|From) + |X-MDMailing-List + |X-XEmacs-List + + # gatewayed through mailing list (thanks to Allen Smith) + |(?:X-)?Resent-(?:From|To|Date) + |(?:X-)?Original-(?:From|To|Date) + + # Spamfilter/virus-scanner headers: too easy to chain from + # these + |X-MailScanner(?:-SpamCheck)? + |X-Spam(?:-(?:Status|Level|Flag|Report|Hits|Score|Checker-Version))? + |X-Antispam |X-RBL-Warning |X-Mailscanner + |X-MDaemon-Deliver-To |X-Virus-Scanned + |X-Mass-Check-Id + |X-Pyzor |X-DCC-\S{2,25}-Metrics + |X-Filtered-B[Yy] |X-Scanned-By |X-Scanner + |X-AP-Spam-(?:Score|Status) |X-RIPE-Spam-Status + |X-SpamCop-[^:]+ + |X-SMTPD |(?:X-)?Spam-Apparently-To + |SPAM |X-Perlmx-Spam + |X-Bogosity + + # some noisy Outlook headers that add no good clues: + |Content-Class |Thread-(?:Index|Topic) + |X-Original[Aa]rrival[Tt]ime + + # Annotations from IMAP, POP, and MH: + |(?:X-)?Status |X-Flags |Replied |Forwarded + |Lines |Content-Length + |X-UIDL? |X-IMAPbase + + # Annotations from Bugzilla + |X-Bugzilla-[^:]+ + + # Annotations from VM: (thanks to Allen Smith) + |X-VM-(?:Bookmark|(?:POP|IMAP)-Retrieved|Labels|Last-Modified + |Summary-Format|VHeader|v\d-Data|Message-Order) + + # Annotations from Gnus: + | X-Gnus-Mail-Source + | Xref + +)}x; + +# Note only the presence of these headers, in order to reduce the +# hapaxen they generate. +$MARK_PRESENCE_ONLY_HDRS = qr{(?: X-Face + |X-(?:Gnu-?PG|PGP|GPG)(?:-Key)?-Fingerprint +)}ix; + +# tweaks tested as of Nov 18 2002 by jm: see SpamAssassin-devel list archives +# for results. The winners are now the default settings. +use constant IGNORE_TITLE_CASE => 1; +use constant TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES => 1; +use constant TOKENIZE_LONG_TOKENS_AS_SKIPS => 1; + +# tweaks of May 12 2003, see SpamAssassin-devel archives again. +use constant PRE_CHEW_ADDR_HEADERS => 1; +use constant CHEW_BODY_URIS => 1; +use constant CHEW_BODY_MAILADDRS => 1; +use constant HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1; +use constant BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1; +use constant URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 0; +use constant IGNORE_MSGID_TOKENS => 0; + +# tweaks of 12 March 2004, see bug 2129. +use constant DECOMPOSE_BODY_TOKENS => 1; +use constant MAP_HEADERS_MID => 1; +use constant MAP_HEADERS_FROMTOCC => 1; +use constant MAP_HEADERS_USERAGENT => 1; + +# tweaks, see http://issues.apache.org/SpamAssassin/show_bug.cgi?id=3173#c26 +use constant ADD_INVIZ_TOKENS_I_PREFIX => 1; +use constant ADD_INVIZ_TOKENS_NO_PREFIX => 0; + +# How many seconds should the opportunistic_expire lock be valid? +$OPPORTUNISTIC_LOCK_VALID = 300; + +# How many of the most significant tokens should we use for the p(w) +# calculation? +use constant N_SIGNIFICANT_TOKENS => 999; + +# How many significant tokens are required for a classifier score to +# be considered usable? +use constant REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE => -1; + +########################################################################### + +sub new { + my $class = shift; + my ($main) = @_; + + $class = ref($class) || $class; + my $self = $class->SUPER::new($main); + bless ($self, $class); + + $self->{main} = $main; + $self->{conf} = $main->{conf}; + $self->{use_ignores} = 1; + + $self->register_eval_rule("check_osbf"); + $self; +} + +sub finish { + my $self = shift; + if ($self->{store}) { + $self->{store}->untie_db(); + } + %{$self} = (); +} + +# Plugin hook. +# Return this implementation object, for callers that need to know +# it. TODO: callers shouldn't *need* to know it! +# +# used in test suite to get access to {store}, internal APIs; +# used in Mail::SpamAssassin::PerMsgStatus for the +# compute_declassification_distance() call. +# +sub learner_get_implementation { + my ($self) = @_; + return $self; +} + +########################################################################### + +sub check_osbf { + my ($self, $pms, $fulltext, $min, $max) = @_; + + return 0 if (!$pms->{conf}->{use_learner}); + return 0 if (!$pms->{conf}->{use_bayes} || !$pms->{conf}->{use_bayes_rules}); + + # TODO: osbf_score? + + if (!exists ($pms->{bayes_score})) { + my $timer = $self->{main}->time_method("check_osbf"); + $pms->{bayes_score} = $self->scan($pms, $pms->{msg}); + } + + if (defined $pms->{bayes_score} && + ($min == 0 || $pms->{bayes_score} > $min) && + ($max eq "undef" || $pms->{bayes_score} <= $max)) + { + if ($pms->{conf}->{detailed_bayes_score}) { + $pms->test_log(sprintf ("score: %3.4f, hits: %s", + $pms->{bayes_score}, + $pms->{bayes_hits})); + } + else { + $pms->test_log(sprintf ("score: %3.4f", $pms->{bayes_score})); + } + return 1; + } + + return 0; +} + +########################################################################### + +# Plugin hook. +sub learner_close { + my ($self, $params) = @_; + if ($self->{store}->db_readable()) { $self->{store}->untie_db(); } +} + +########################################################################### + +sub ignore_message { + my ($self,$PMS) = @_; + + return 0 unless $self->{use_ignores}; + + my $ig_from = $self->{main}->call_plugins ("check_wb_list", + { permsgstatus => $PMS, type => 'from', list => 'bayes_ignore_from' }); + my $ig_to = $self->{main}->call_plugins ("check_wb_list", + { permsgstatus => $PMS, type => 'to', list => 'bayes_ignore_to' }); + + my $ignore = $ig_from || $ig_to; + + dbg("osbf: not using bayes, bayes_ignore_from or _to rule") if $ignore; + + return $ignore; +} + +########################################################################### + +# Plugin hook. +sub learn_message { + my ($self, $params) = @_; + my $isspam = $params->{isspam}; + my $msg = $params->{msg}; + my $id = $params->{id}; + + if (!$self->{conf}->{use_bayes}) { return; } + + # Winnow cannot support learning to journal + $self->{main}->{learn_to_journal} = 0; + + my $msgdata = $self->get_body_from_msg ($msg); + my $ret; + + eval { + local $SIG{'__DIE__'}; # do not run user die() traps in here + + my $ok; + if ($self->{main}->{learn_to_journal}) { + # If we're going to learn to journal, we'll try going r/o first... + # If that fails for some reason, let's try going r/w. This happens + # if the DB doesn't exist yet. + $ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable(); + } else { + $ok = $self->{store}->tie_db_writable(); + } + + if ($ok) { + $ret = $self->_learn_trapped ($isspam, $msg, $msgdata, $id); + + if (!$self->{main}->{learn_caller_will_untie}) { + $self->{store}->untie_db(); + } + } + 1; + } or do { # if we died, untie the dbs. + my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; + $self->{store}->untie_db(); + die "osbf: (in learn) $eval_stat\n"; + }; + + return $ret; +} + +# this function is trapped by the wrapper above +sub _learn_trapped { + my ($self, $isspam, $msg, $msgdata, $msgid) = @_; + my @msgid = ( $msgid ); + + if (!defined $msgid) { + @msgid = $self->get_msgid($msg); + } + + foreach $msgid ( @msgid ) { + my $seen = $self->{store}->seen_get ($msgid); + + if (defined ($seen)) { + if (($seen =~ /^s/ && $isspam) || ($seen =~ /^h/ && !$isspam)) { + dbg("osbf: $msgid already learnt correctly, not learning twice"); + return 0; + } elsif ($seen !~ /^[hs]/) { + warn("osbf: db_seen corrupt: value='$seen' for $msgid, ignored"); + } else { + # bug 3704: If the message was already learned, don't try learning it again. + # this prevents, for instance, manually learning as spam, then autolearning + # as ham, or visa versa. + if ($self->{main}->{learn_no_relearn}) { + dbg("osbf: $msgid already learnt as opposite, not re-learning"); + return 0; + } + + dbg("osbf: $msgid already learnt as opposite, forgetting first"); + + # kluge so that forget() won't untie the db on us ... + my $orig = $self->{main}->{learn_caller_will_untie}; + $self->{main}->{learn_caller_will_untie} = 1; + + my $fatal = !defined $self->{main}->{bayes_scanner}->forget ($msg); + + # reset the value post-forget() ... + $self->{main}->{learn_caller_will_untie} = $orig; + + # forget() gave us a fatal error, so propagate that up + if ($fatal) { + dbg("osbf: forget() returned a fatal error, so learn() will too"); + return; + } + } + + # we're only going to have seen this once, so stop if it's been + # seen already + last; + } + } + + # Now that we're sure we haven't seen this message before ... + $msgid = $msgid[0]; + + if ($isspam) { + $self->{store}->nspam_nham_change (1, 0); + } else { + $self->{store}->nspam_nham_change (0, 1); + } + + my $msgatime = $msg->receive_date(); + + # If the message atime comes back as being more than 1 day in the + # future, something's messed up and we should revert to current time as + # a safety measure. + # + $msgatime = time if ( $msgatime - time > 86400 ); + + # we don't use the weights when training + my $tokens = $self->tokenize($msg, $msgdata); + + if ($isspam) { + $self->{store}->multi_tok_count_change(1, 0, $tokens, $msgatime); + } else { + $self->{store}->multi_tok_count_change(0, 1, $tokens, $msgatime); + } + + $self->{store}->seen_put ($msgid, ($isspam ? 's' : 'h')); + $self->{store}->cleanup(); + + $self->{main}->call_plugins("bayes_learn", { toksref => $tokens, + isspam => $isspam, + msgid => $msgid, + msgatime => $msgatime, + }); + + dbg("osbf: learned '$msgid', atime: $msgatime"); + + 1; +} + +########################################################################### + +# Plugin hook. +sub forget_message { + my ($self, $params) = @_; + my $msg = $params->{msg}; + my $id = $params->{id}; + + if (!$self->{conf}->{use_bayes}) { return; } + + my $msgdata = $self->get_body_from_msg ($msg); + my $ret; + + # we still tie for writing here, since we write to the seen db + # synchronously + eval { + local $SIG{'__DIE__'}; # do not run user die() traps in here + + my $ok; + if ($self->{main}->{learn_to_journal}) { + # If we're going to learn to journal, we'll try going r/o first... + # If that fails for some reason, let's try going r/w. This happens + # if the DB doesn't exist yet. + $ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable(); + } else { + $ok = $self->{store}->tie_db_writable(); + } + + if ($ok) { + $ret = $self->_forget_trapped ($msg, $msgdata, $id); + + if (!$self->{main}->{learn_caller_will_untie}) { + $self->{store}->untie_db(); + } + } + 1; + } or do { # if we died, untie the dbs. + my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; + $self->{store}->untie_db(); + die "osbf: (in forget) $eval_stat\n"; + }; + + return $ret; +} + +# this function is trapped by the wrapper above +sub _forget_trapped { + my ($self, $msg, $msgdata, $msgid) = @_; + my @msgid = ( $msgid ); + my $isspam; + + if (!defined $msgid) { + @msgid = $self->get_msgid($msg); + } + + while( $msgid = shift @msgid ) { + my $seen = $self->{store}->seen_get ($msgid); + + if (defined ($seen)) { + if ($seen eq 's') { + $isspam = 1; + } elsif ($seen eq 'h') { + $isspam = 0; + } else { + dbg("osbf: forget: msgid $msgid seen entry is neither ham nor spam, ignored"); + return 0; + } + + # messages should only be learned once, so stop if we find a msgid + # which was seen before + last; + } + else { + dbg("osbf: forget: msgid $msgid not learnt, ignored"); + } + } + + # This message wasn't learnt before, so return + if (!defined $isspam) { + dbg("osbf: forget: no msgid from this message has been learnt, skipping message"); + return 0; + } + elsif ($isspam) { + $self->{store}->nspam_nham_change (-1, 0); + } + else { + $self->{store}->nspam_nham_change (0, -1); + } + + # we don't use the weights when training + my $tokens = $self->tokenize($msg, $msgdata); + + if ($isspam) { + $self->{store}->multi_tok_count_change (-1, 0, $tokens); + } else { + $self->{store}->multi_tok_count_change (0, -1, $tokens); + } + + $self->{store}->seen_delete ($msgid); + $self->{store}->cleanup(); + + $self->{main}->call_plugins("bayes_forget", { toksref => $tokens, + isspam => $isspam, + msgid => $msgid, + }); + + 1; +} + +########################################################################### + +# Plugin hook. +sub learner_sync { + my ($self, $params) = @_; + if (!$self->{conf}->{use_bayes}) { return 0; } + dbg("osbf: osbf journal sync starting"); + $self->{store}->sync($params); + dbg("osbf: osbf journal sync completed"); +} + +########################################################################### + +# Plugin hook. +sub learner_expire_old_training { + my ($self, $params) = @_; + if (!$self->{conf}->{use_bayes}) { return 0; } + dbg("osbf: expiry starting"); + $self->{store}->expire_old_tokens($params); + dbg("osbf: expiry completed"); +} + +########################################################################### + +# Plugin hook. +# Check to make sure we can tie() the DB, and we have enough entries to do a scan +# if we're told the caller will untie(), go ahead and leave the db tied. +sub learner_is_scan_available { + my ($self, $params) = @_; + + return 0 unless $self->{conf}->{use_bayes}; + return 0 unless $self->{store}->tie_db_readonly(); + + # We need the DB to stay tied, so if the journal sync occurs, don't untie! + my $caller_untie = $self->{main}->{learn_caller_will_untie}; + $self->{main}->{learn_caller_will_untie} = 1; + + # Do a journal sync if necessary. Do this before the nspam_nham_get() + # call since the sync may cause an update in the number of messages + # learnt. + $self->_opportunistic_calls(1); + + # Reset the variable appropriately + $self->{main}->{learn_caller_will_untie} = $caller_untie; + + my ($ns, $nn) = $self->{store}->nspam_nham_get(); + + if ($ns < $self->{conf}->{bayes_min_spam_num}) { + dbg("osbf: not available for scanning, only $ns spam(s) in osbf DB < ".$self->{conf}->{bayes_min_spam_num}); + if (!$self->{main}->{learn_caller_will_untie}) { + $self->{store}->untie_db(); + } + return 0; + } + if ($nn < $self->{conf}->{bayes_min_ham_num}) { + dbg("osbf: not available for scanning, only $nn ham(s) in osbf DB < ".$self->{conf}->{bayes_min_ham_num}); + if (!$self->{main}->{learn_caller_will_untie}) { + $self->{store}->untie_db(); + } + return 0; + } + + return 1; +} + +########################################################################### + +sub scan { + my ($self, $permsgstatus, $msg) = @_; + my $score; + + return unless $self->{conf}->{use_learner}; + + # When we're doing a scan, we'll guarantee that we'll do the untie, + # so override the global setting until we're done. + my $caller_untie = $self->{main}->{learn_caller_will_untie}; + $self->{main}->{learn_caller_will_untie} = 1; + + goto skip if ($self->{main}->{bayes_scanner}->ignore_message($permsgstatus)); + goto skip unless $self->learner_is_scan_available(); + + my ($ns, $nn) = $self->{store}->nspam_nham_get(); + dbg("osbf: corpus size: nspam = $ns, nham = $nn"); + + my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus); + my ($msgtokens, $tokweights) = $self->tokenize($msg, $msgdata); + my $tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens}); + + ## my $keep_pw = $self->{main}->have_plugin("bayes_scan"); + my %pw; + + foreach my $tokendata (@{$tokensdata}) { + my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata}; + + my $prob = $self->_compute_prob_for_token($token, $ns, $nn, + $tok_spam, $tok_ham, $tokweights->{$token}); +# dbg("osbf: JMD token '$msgtokens->{$token}' => ".(defined $prob ? (sprintf "%3.5f", $prob) : '??')); + next unless defined $prob; + + $pw{$token} = { + prob => $prob, + spam_count => $tok_spam, + ham_count => $tok_ham, + atime => $atime + }; + } + + # If none of the tokens were found in the DB, we're going to skip + # this message... + if (!keys %pw) { + dbg("bayes: cannot use osbf on this message; none of the tokens were found in the database"); + goto skip; + } + + my $tcount_total = keys %{$msgtokens}; + my $tcount_learned = keys %pw; + + # Figure out the message receive time (used as atime below) + # If the message atime comes back as being in the future, something's + # messed up and we should revert to current time as a safety measure. + # + my $msgatime = $msg->receive_date(); + my $now = time; + $msgatime = $now if ( $msgatime > $now ); + + # now take the $count most significant tokens and calculate probs using + # Robinson's formula. + my $count = N_SIGNIFICANT_TOKENS; + my @sorted; + + my @touch_tokens; + my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = []; + my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = []; + + my %tok_strength = map { $_ => (abs($pw{$_}->{prob} - 0.5)) } keys %pw; + my $log_each_token = (would_log('dbg', 'osbf') > 1); + + foreach my $tok ( + sort { $tok_strength{$b} <=> $tok_strength{$a} } + keys %pw) + { + if ($count-- < 0) { last; } + # next if ($tok_strength{$tok} < + # $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH); + + my $pw = $pw{$tok}->{prob}; + + # What's more expensive, scanning headers for HAMMYTOKENS and + # SPAMMYTOKENS tags that aren't there or collecting data that + # won't be used? Just collecting the data is certainly simpler. + # + my $raw_token = $msgtokens->{$tok} || "(unknown)"; + my $s = $pw{$tok}->{spam_count}; + my $n = $pw{$tok}->{ham_count}; + my $a = $pw{$tok}->{atime}; + + if (0) { # TODO + if ($pw < 0.5) { + push @$tinfo_hammy, [$raw_token,$pw,$s,$n,$a]; + } else { + push @$tinfo_spammy, [$raw_token,$pw,$s,$n,$a]; + } + } + + push (@sorted, $pw); + + # update the atime on this token, it proved useful + push(@touch_tokens, $tok); + + # dbg("osbf: JMD token '$raw_token' => ".(sprintf "%3.5f", $pw)); + if ($log_each_token) { + dbg("osbf: token '$raw_token' => ".(sprintf "%3.5f", $pw)); + } + } + + if (!@sorted || (REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE > 0 && + $#sorted <= REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE)) + { + dbg("osbf: cannot use osbf on this message; not enough usable tokens found"); + goto skip; + } + + $score = Mail::SpamAssassin::Bayes::Combine::combine($ns, $nn, \@sorted); + + # Couldn't come up with a probability? + goto skip unless defined $score; + + # HACK HACK scale to a better range, since EDDC seems to bias low + $score *= 1.1; $score = 1.0 if $score > 1.0; + + dbg("osbf: score = $score"); + + # no need to call tok_touch_all unless there were significant + # tokens and a score was returned + # we don't really care about the return value here + $self->{store}->tok_touch_all(\@touch_tokens, $msgatime); + + $permsgstatus->{bayes_nspam} = $ns; + $permsgstatus->{bayes_nham} = $nn; + + $self->{main}->call_plugins("bayes_scan", { toksref => $msgtokens, + probsref => \%pw, + score => $score, + msgatime => $msgatime, + significant_tokens => \@touch_tokens, + }); + +skip: + if (!defined $score) { + dbg("osbf: not scoring message, returning undef"); + } + + # Take any opportunistic actions we can take + if ($self->{main}->{opportunistic_expire_check_only}) { + # we're supposed to report on expiry only -- so do the + # _opportunistic_calls() run for the journal only. + $self->_opportunistic_calls(1); + $permsgstatus->{bayes_expiry_due} = $self->{store}->expiry_due(); + } + else { + $self->_opportunistic_calls(); + } + + # Do any cleanup we need to do + $self->{store}->cleanup(); + + # Reset the value accordingly + $self->{main}->{learn_caller_will_untie} = $caller_untie; + + # If our caller won't untie the db, we need to do it. + if (!$caller_untie) { + $self->{store}->untie_db(); + } + + $permsgstatus->{tag_data}{BAYESTCHAMMY} = + ($tinfo_hammy ? scalar @{$tinfo_hammy} : 0); + $permsgstatus->{tag_data}{BAYESTCSPAMMY} = + ($tinfo_spammy ? scalar @{$tinfo_spammy} : 0); + $permsgstatus->{tag_data}{BAYESTCLEARNED} = $tcount_learned; + $permsgstatus->{tag_data}{BAYESTC} = $tcount_total; + + return $score; +} + +########################################################################### + +# Plugin hook. +sub learner_dump_database { + my ($self, $params) = @_; + my $magic = $params->{magic}; + my $toks = $params->{toks}; + my $regex = $params->{regex}; + + # allow dump to occur even if use_bayes disables everything else ... + #return 0 unless $self->{conf}->{use_bayes}; + return 0 unless $self->{store}->tie_db_readonly(); + + my @vars = $self->{store}->get_storage_variables(); + + my($sb,$ns,$nh,$nt,$le,$oa,$bv,$js,$ad,$er,$na) = @vars; + + my $template = '%3.3f %10u %10u %10u %s'."\n"; + + if ( $magic ) { + printf ($template, 0.0, 0, $bv, 0, 'non-token data: osbf db version'); + printf ($template, 0.0, 0, $ns, 0, 'non-token data: nspam'); + printf ($template, 0.0, 0, $nh, 0, 'non-token data: nham'); + printf ($template, 0.0, 0, $nt, 0, 'non-token data: ntokens'); + printf ($template, 0.0, 0, $oa, 0, 'non-token data: oldest atime'); + printf ($template, 0.0, 0, $na, 0, 'non-token data: newest atime') if ( $bv >= 2 ); + printf ($template, 0.0, 0, $sb, 0, 'non-token data: current scan-count') if ( $bv < 2 ); + printf ($template, 0.0, 0, $js, 0, 'non-token data: last journal sync atime') if ( $bv >= 2 ); + printf ($template, 0.0, 0, $le, 0, 'non-token data: last expiry atime'); + if ( $bv >= 2 ) { + printf ($template, 0.0, 0, $ad, 0, 'non-token data: last expire atime delta'); + printf ($template, 0.0, 0, $er, 0, 'non-token data: last expire reduction count'); + } + } + + if ( $toks ) { + # let the store sort out the db_toks + $self->{store}->dump_db_toks($template, $regex, @vars); + } + + if (!$self->{main}->{learn_caller_will_untie}) { + $self->{store}->untie_db(); + } + return 1; +} + +########################################################################### +# TODO: these are NOT public, but the test suite needs to call them. + +sub get_msgid { + my ($self, $msg) = @_; + + my @msgid; + + my $msgid = $msg->get_header("Message-Id"); + if (defined $msgid && $msgid ne '' && $msgid !~ /^\s*<\s*(?:\@sa_generated)?>.*$/) { + # remove \r and < and > prefix/suffixes + chomp $msgid; + $msgid =~ s/^.*$//g; + push(@msgid, $msgid); + } + + # Use sha1_hex(Date:, last received: and top N bytes of body) + # where N is MIN(1024 bytes, 1/2 of body length) + # + my $date = $msg->get_header("Date"); + $date = "None" if (!defined $date || $date eq ''); # No Date? + + my @rcvd = $msg->get_header("Received"); + my $rcvd = $rcvd[$#rcvd]; + $rcvd = "None" if (!defined $rcvd || $rcvd eq ''); # No Received? + + # Make a copy since pristine_body is a reference ... + my $body = join('', $msg->get_pristine_body()); + if (length($body) > 64) { # Small Body? + my $keep = ( length $body > 2048 ? 1024 : int(length($body) / 2) ); + substr($body, $keep) = ''; + } + + unshift(@msgid, sha1_hex($date."\000".$rcvd."\000".$body).'@sa_generated'); + + return wantarray ? @msgid : $msgid[0]; +} + +sub get_body_from_msg { + my ($self, $msg) = @_; + + if (!ref $msg) { + # I have no idea why this seems to happen. TODO + warn "osbf: msg not a ref: '$msg'"; + return { }; + } + + my $permsgstatus = + Mail::SpamAssassin::PerMsgStatus->new($self->{main}, $msg); + $msg->extract_message_metadata ($permsgstatus); + my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus); + $permsgstatus->finish(); + + if (!defined $msgdata) { + # why?! + warn "osbf: failed to get body for ".scalar($self->get_msgid($self->{msg}))."\n"; + return { }; + } + + return $msgdata; +} + +sub _get_msgdata_from_permsgstatus { + my ($self, $msg) = @_; + + my $msgdata = { }; + $msgdata->{bayes_token_body} = $msg->{msg}->get_visible_rendered_body_text_array(); + $msgdata->{bayes_token_inviz} = $msg->{msg}->get_invisible_rendered_body_text_array(); + @{$msgdata->{bayes_token_uris}} = $msg->get_uri_list(); + return $msgdata; +} + +########################################################################### + +# The calling functions expect a uniq'ed array of tokens ... +sub tokenize { + my ($self, $msg, $msgdata) = @_; + + # the body + my @tokens = map { $self->_tokenize_line ($_, '', 1) } + @{$msgdata->{bayes_token_body}}; + + # the URI list + push (@tokens, map { $self->_tokenize_line ($_, '', 2) } + @{$msgdata->{bayes_token_uris}}); + + # add invisible tokens + if (ADD_INVIZ_TOKENS_I_PREFIX) { + push (@tokens, map { $self->_tokenize_line ($_, "I*:", 1) } + @{$msgdata->{bayes_token_inviz}}); + } + if (ADD_INVIZ_TOKENS_NO_PREFIX) { + push (@tokens, map { $self->_tokenize_line ($_, "", 1) } + @{$msgdata->{bayes_token_inviz}}); + } + + # Tokenize the headers + my %hdrs = $self->_tokenize_headers ($msg); + while( my($prefix, $value) = each %hdrs ) { + push(@tokens, $self->_tokenize_line ($value, "H$prefix:", 0)); + } + + # Go ahead and uniq the array, skip null tokens (can happen sometimes) + # generate an SHA1 hash and take the lower 40 bits as our token + my %tokens; + my %weights; + foreach my $token (@tokens) { + my $distance = '0'; # OSB bigram token distance; default to 0 + if ($token && $token =~ s/^([0-5])://) { + $distance = $1; + } + + next unless length($token); # skip still 0-length tokens + my $hash = substr(sha1($token), -5); + $tokens{$hash} = $token; + + # set the weight to be the lowest distance for that token + if (!(defined $weights{$hash}) + || ($weights{$hash} > $distance)) + { + $weights{$hash} = $distance; + } + } + + # return the keys == tokens ... + return (\%tokens, \%weights); +} + +sub _tokenize_line { + my $self = $_[0]; + my $tokprefix = $_[2]; + my $region = $_[3]; + + my @rettokens; + my $magic_re = $self->{store}->get_magic_re(); + my ($w1,$w2,$w3,$w4,$w5) = ('','','','',''); + + my @words = ($_[1] =~ + /([^\p{Z}\p{C}][\/!?#]?[-\p{L}\p{M}\p{N}]*(?:['"=;]|\/?>|:\/*)?)/g); + foreach my $token (@words) + { + next if ($token =~ /^[\.\,]+$/); # just punctuation + # $token =~ s/^[-'"\.,]+//; # trim non-alphanum chars at start or end + # $token =~ s/[-'"\.,]+$//; # so we don't get loads of '"foo' tokens + + # Skip false magic tokens + # TVD: we need to do a defined() check since SQL doesn't have magic + # tokens, so the SQL BayesStore returns undef. I really want a way + # of optimizing that out, but I haven't come up with anything yet. + # + next if ( defined $magic_re && $token =~ /$magic_re/ ); + + # are we in the body? If so, apply some body-specific breakouts + if ($region == 1 || $region == 2) { + if (0 && CHEW_BODY_MAILADDRS && $token =~ /\S\@\S/i) { + push (@rettokens, $self->_tokenize_mail_addrs ($token)); + } + elsif (0 && CHEW_BODY_URIS && $token =~ /\S\.[a-z]/i) { + push (@rettokens, "UD:".$token); # the full token + my $bit = $token; while ($bit =~ s/^[^\.]+\.(.+)$/$1/gs) { + push (@rettokens, "UD:".$1); # UD = URL domain + } + } + } + + # decompose tokens? do this after shortening long tokens + if ($region == 1 || $region == 2) { + if (0 && DECOMPOSE_BODY_TOKENS) { + if ($token =~ /[^\w:\*]/) { + my $decompd = $token; # "Foo!" + $decompd =~ s/[^\w:\*]//gs; + push (@rettokens, $tokprefix.$decompd); # "Foo" + } + + if ($token =~ /[A-Z]/) { + my $decompd = $token; $decompd = lc $decompd; + push (@rettokens, $tokprefix.$decompd); # "foo!" + + if ($token =~ /[^\w:\*]/) { + $decompd =~ s/[^\w:\*]//gs; + push (@rettokens, $tokprefix.$decompd); # "foo" + } + } + } + } + + $w5 = $w4; + $w4 = $w3; + $w3 = $w2; + $w2 = $w1; + $w1 = $tokprefix.$token; + + # here's the OSB (orthogonal sparse bigrams) part. + # record intra-token distance for weighting, too + push (@rettokens, $w2.' '.$w1); + push (@rettokens, '1:'.$w3.' '.$w1); + push (@rettokens, '2:'.$w4.' '.$w1); + push (@rettokens, '3:'.$w5.' '.$w1); + } + + return @rettokens; +} + +sub _tokenize_headers { + my ($self, $msg) = @_; + + my %parsed; + + my %user_ignore; + $user_ignore{lc $_} = 1 for @{$self->{main}->{conf}->{bayes_ignore_headers}}; + + # get headers in array context + my @hdrs; + my @rcvdlines; + for ($msg->get_all_headers()) { + # first, keep a copy of Received headers, so we can strip down to last 2 + if (/^Received:/i) { + push(@rcvdlines, $_); + next; + } + # and now skip lines for headers we don't want (including all Received) + next if /^${IGNORED_HDRS}:/i; + next if IGNORE_MSGID_TOKENS && /^Message-ID:/i; + push(@hdrs, $_); + } + push(@hdrs, $msg->get_all_metadata()); + + # and re-add the last 2 received lines: usually a good source of + # spamware tokens and HELO names. + if ($#rcvdlines >= 0) { push(@hdrs, $rcvdlines[$#rcvdlines]); } + if ($#rcvdlines >= 1) { push(@hdrs, $rcvdlines[$#rcvdlines-1]); } + + for (@hdrs) { + next unless /\S/; + my ($hdr, $val) = split(/:/, $_, 2); + + # remove user-specified headers here, after Received, in case they + # want to ignore that too + next if exists $user_ignore{lc $hdr}; + + # Prep the header value + $val ||= ''; + chomp($val); + + # special tokenization for some headers: + if ($hdr =~ /^(?:|X-|Resent-)Message-Id$/i) { + $val = $self->_pre_chew_message_id ($val); + } + elsif (PRE_CHEW_ADDR_HEADERS && $hdr =~ /^(?:|X-|Resent-) + (?:Return-Path|From|To|Cc|Reply-To|Errors-To|Mail-Followup-To|Sender)$/ix) + { + $val = $self->_pre_chew_addr_header ($val); + } + elsif ($hdr eq 'Received') { + $val = $self->_pre_chew_received ($val); + } + elsif ($hdr eq 'Content-Type') { + $val = $self->_pre_chew_content_type ($val); + } + elsif ($hdr eq 'MIME-Version') { + $val =~ s/1\.0//; # totally innocuous + } + elsif ($hdr =~ /^${MARK_PRESENCE_ONLY_HDRS}$/i) { + $val = "1"; # just mark the presence, they create lots of hapaxen + } + + if (MAP_HEADERS_MID) { + if ($hdr =~ /^(?:In-Reply-To|References|Message-ID)$/i) { + $parsed{"*MI"} = $val; + } + } + if (MAP_HEADERS_FROMTOCC) { + if ($hdr =~ /^(?:From|To|Cc)$/i) { + $parsed{"*Ad"} = $val; + } + } + if (MAP_HEADERS_USERAGENT) { + if ($hdr =~ /^(?:X-Mailer|User-Agent)$/i) { + $parsed{"*UA"} = $val; + } + } + + if (exists $parsed{$hdr}) { + $parsed{$hdr} .= " ".$val; + } else { + $parsed{$hdr} = $val; + } + if (would_log('dbg', 'osbf') > 1) { + dbg("osbf: header tokens for $hdr = \"$parsed{$hdr}\""); + } + } + + return %parsed; +} + +sub _pre_chew_content_type { + my ($self, $val) = @_; + + # hopefully this will retain good bits without too many hapaxen + if ($val =~ s/boundary=[\"\'](.*?)[\"\']/ /ig) { + my $boundary = $1; + $boundary =~ s/[a-fA-F0-9]/H/gs; + # break up blocks of separator chars so they become their own tokens + $boundary =~ s/([-_\.=]+)/ $1 /gs; + $val .= $boundary; + } + + # stop-list words for Content-Type header: these wind up totally gray + $val =~ s/\b(?:text|charset)\b//; + + $val; +} + +sub _pre_chew_message_id { + my ($self, $val) = @_; + # we can (a) get rid of a lot of hapaxen and (b) increase the token + # specificity by pre-parsing some common formats. + + # Outlook Express format: + $val =~ s/<([0-9a-f]{4})[0-9a-f]{4}[0-9a-f]{4}\$ + ([0-9a-f]{4})[0-9a-f]{4}\$ + ([0-9a-f]{8})\@(\S+)>/ OEA$1 OEB$2 OEC$3 $4 /gx; + + # Exim: + $val =~ s/<[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]\@//; + + # Sendmail: + $val =~ s/<20\d\d[01]\d[0123]\d[012]\d[012345]\d[012345]\d\. + [A-F0-9]{10,12}\@//gx; + + # try to split Message-ID segments on probable ID boundaries. Note that + # Outlook message-ids seem to contain a server identifier ID in the last + # 8 bytes before the @. Make sure this becomes its own token, it's a + # great spam-sign for a learning system! Be sure to split on ".". + $val =~ s/[^_A-Za-z0-9]/ /g; + $val; +} + +sub _pre_chew_received { + my ($self, $val) = @_; + + # Thanks to Dan for these. Trim out "useless" tokens; sendmail-ish IDs + # and valid-format RFC-822/2822 dates + + $val =~ s/\swith\sSMTP\sid\sg[\dA-Z]{10,12}\s/ /gs; # Sendmail + $val =~ s/\swith\sESMTP\sid\s[\dA-F]{10,12}\s/ /gs; # Sendmail + $val =~ s/\bid\s[a-zA-Z0-9]{7,20}\b/ /gs; # Sendmail + $val =~ s/\bid\s[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]/ /gs; # exim + + $val =~ s/(?:(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s)? + [0-3\s]?[0-9]\s + (?:Jan|Feb|Ma[ry]|Apr|Ju[nl]|Aug|Sep|Oct|Nov|Dec)\s + (?:19|20)?[0-9]{2}\s + [0-2][0-9](?:\:[0-5][0-9]){1,2}\s + (?:\s*\(|\)|\s*(?:[+-][0-9]{4})|\s*(?:UT|[A-Z]{2,3}T))* + //gx; + + # IPs: break down to nearest /24, to reduce hapaxes -- EXCEPT for + # IPs in the 10 and 192.168 ranges, they gets lots of significant tokens + # (on both sides) + # also make a dup with the full IP, as fodder for + # bayes_dump_to_trusted_networks: "H*r:ip*aaa.bbb.ccc.ddd" + $val =~ s{\b(\d{1,3}\.)(\d{1,3}\.)(\d{1,3})(\.\d{1,3})\b}{ + if ($2 eq '10' || ($2 eq '192' && $3 eq '168')) { + $1.$2.$3.$4. + " ip*".$1.$2.$3.$4." "; + } else { + $1.$2.$3. + " ip*".$1.$2.$3.$4." "; + } + }gex; + + # trim these: they turn out as the most common tokens, but with a + # prob of about .5. waste of space! + $val =~ s/\b(?:with|from|for|SMTP|ESMTP)\b/ /g; + + $val; +} + +sub _pre_chew_addr_header { + my ($self, $val) = @_; + local ($_); + + my @addrs = $self->{main}->find_all_addrs_in_line ($val); + my @toks; + foreach (@addrs) { + push (@toks, $self->_tokenize_mail_addrs ($_)); + } + return join (' ', @toks); +} + +sub _tokenize_mail_addrs { + my ($self, $addr) = @_; + + ($addr =~ /(.+)\@(.+)$/) or return (); + my @toks; + push(@toks, "U*".$1, "D*".$2); + $_ = $2; while (s/^[^\.]+\.(.+)$/$1/gs) { push(@toks, "D*".$1); } + return @toks; +} + + +########################################################################### + +our $distance_weights = [ 3125, 256, 27, 4 ]; + +# compute the probability that a token is spammish +sub _compute_prob_for_token { + my ($self, $token, $ns, $nn, $s, $n, $distance) = @_; + + # we allow the caller to give us the token information, just + # to save a potentially expensive lookup + if (!defined($s) || !defined($n)) { + ($s, $n, undef) = $self->{store}->tok_get ($token); + } + return if ($s == 0 && $n == 0); + + # we can't do anything if we haven't trained on any ham/spam + return if ( $ns == 0 || $nn == 0 ); + + # weighting for OSB sparse features + my $weight; + if (defined $distance) { + $weight = $distance_weights->[$distance]; + } + $weight ||= $distance_weights->[0]; + + # apply the EDDC algorithm, specifically the part from section 4 + # of the osbf-eddc.pdf paper + + # normalized count of docs containing feature in spam/ham + my $NDfs = ($s * 10000) / $ns; + my $NDfh = ($n * 10000) / $nn; + + # these values are as dictated in the EDDC paper, except for K3; + # normally that should be 8, but 1 allows strong tokens to reach + # 0.99 instead of 0.125, which makes more sense for us + my $K1 = 0.25; + my $K2 = 10; + my $K3 = 1; + + my $Sumf = $s + $n; die "assert: Sumf == 0" unless $Sumf; + my $WdotSumf = $weight * $Sumf; + + # the resulting confidence factor equation + my $CF = ((((($NDfs - $NDfh) ** 2) + ($NDfs * $NDfh) - ($K1 / $Sumf)) + / (($NDfs + $NDfh) ** 2)) ** $K2) + * ($WdotSumf / (1 + $K3 * $WdotSumf)); + + my $ratios = ($s / $ns); + my $ration = ($n / $nn); + my $prob; + + if ($ratios == 0 && $ration == 0) { + warn "osbf: oops? ratios == ration == 0"; + return; + } else { + $prob = ($ratios / ($ration + $ratios)); + $prob = ($prob - 0.5) * $CF + 0.5; # apply EDDC + } + + # warn "JMD $s/$ns $n/$nn cf=$CF p=$prob"; + + return $prob; +} + +########################################################################### +# If a token is neither hammy nor spammy, return 0. +# For a spammy token, return the minimum number of additional ham messages +# it would have had to appear in to no longer be spammy. Hammy tokens +# are handled similarly. That's what the function does (at the time +# of this writing, 31 July 2003, 16:02:55 CDT). It would be slightly +# more useful if it returned the number of /additional/ ham messages +# a spammy token would have to appear in to no longer be spammy but I +# fear that might require the solution to a cubic equation, and I +# just don't have the time for that now. + +sub compute_declassification_distance { + my ($self, $Ns, $Nn, $ns, $nn, $prob) = @_; + return 0; + #TODO? probably not +} + +########################################################################### + +sub _opportunistic_calls { + my($self, $journal_only) = @_; + + # If we're not already tied, abort. + if (!$self->{store}->db_readable()) { + dbg("osbf: opportunistic call attempt failed, DB not readable"); + return; + } + + # Is an expire or sync running? + my $running_expire = $self->{store}->get_running_expire_tok(); + if ( defined $running_expire && $running_expire+$OPPORTUNISTIC_LOCK_VALID > time() ) { + dbg("osbf: opportunistic call attempt skipped, found fresh running expire magic token"); + return; + } + + # handle expiry and syncing + if (!$journal_only && $self->{store}->expiry_due()) { + dbg("osbf: opportunistic call found expiry due"); + + # sync will bring the DB R/W as necessary, and the expire will remove + # the running_expire token, may untie as well. + $self->{main}->{bayes_scanner}->sync(1,1); + } + elsif ( $self->{store}->sync_due() ) { + dbg("osbf: opportunistic call found journal sync due"); + + # sync will bring the DB R/W as necessary, may untie as well + $self->{main}->{bayes_scanner}->sync(1,0); + + # We can only remove the running_expire token if we're doing R/W + if ($self->{store}->db_writable()) { + $self->{store}->remove_running_expire_tok(); + } + } + + return; +} + +########################################################################### + +sub learner_new { + my ($self) = @_; + + if ($self->{conf}->{osbf_store_module}) { + my $module = $self->{conf}->{osbf_store_module}; + $module = untaint_var($module); # good enough? + my $store; + + eval ' + require '.$module.'; + $store = '.$module.'->new($self); + 1; + ' or do { + my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; + die "osbf: (in new) $eval_stat\n"; + }; + $self->{store} = $store; + } + else { + require Mail::SpamAssassin::OSBF::Store::DBM; + $self->{store} = Mail::SpamAssassin::OSBF::Store::DBM->new($self); + } + + $self; +} + +1; + +=back + +=cut Index: rules/v330.pre =================================================================== --- rules/v330.pre (.../trunk) (revision 0) +++ rules/v330.pre (.../branches/bug-5293-pluginized-bayes) (revision 602913) @@ -0,0 +1,20 @@ +# This is the right place to customize your installation of SpamAssassin. +# +# See 'perldoc Mail::SpamAssassin::Conf' for details of what can be +# tweaked. +# +# This file was installed during the installation of SpamAssassin 3.3.0, +# and contains plugin loading commands for the new plugins added in that +# release. It will not be overwritten during future SpamAssassin installs, +# so you can modify it to enable some disabled-by-default plugins below, +# if you so wish. +# +# There are now multiple files read to enable plugins in the +# /etc/mail/spamassassin directory; previously only one, "init.pre" was +# read. Now both "init.pre", "v310.pre", and any other files ending in +# ".pre" will be read. As future releases are made, new plugins will be +# added to new files, named according to the release they're added in. +########################################################################### + +# OSBF - EXPERIMENTAL alternative to Bayes +# loadplugin Mail::SpamAssassin::Plugin::OSBF Index: masses/bayes-testing/bayes-10pcv-driver =================================================================== --- masses/bayes-testing/bayes-10pcv-driver (.../trunk) (revision 602913) +++ masses/bayes-testing/bayes-10pcv-driver (.../branches/bug-5293-pluginized-bayes) (revision 602913) @@ -56,6 +56,7 @@ echo " bayes_path $tmpdir/dbs/bayes +osbf_path $tmpdir/dbs/osbf bayes_auto_learn 0 bayes_min_ham_num 10 bayes_min_spam_num 10 @@ -76,6 +77,10 @@ echo "Restoring full learned DBs..." ( cd $tmpdir; rm -rf dbs; tar xf learned-all.tar ) } +runcmd () { + echo "$*" + time $* +} if [ $LEARN_ALL_THEN_FORGET_TEST_SET = 1 ] ; then @@ -87,14 +92,14 @@ ( echo -n "Learning from all ham buckets..." ; date - time sa-learn --ham --randseed=1 --no-sync $learnargs \ + runcmd sa-learn --ham --randseed=1 --no-sync $learnargs \ --showdots --mbox --config-file=$tmpdir/rules $testdir/cor/ham/* echo -n "Learning from all spam buckets..." ; date - time sa-learn --spam --randseed=1 --no-sync $learnargs \ + runcmd sa-learn --spam --randseed=1 --no-sync $learnargs \ --showdots --mbox --config-file=$tmpdir/rules $testdir/cor/spam/* - time sa-learn --sync $learnargs --config-file=$tmpdir/rules + runcmd sa-learn --sync $learnargs --config-file=$tmpdir/rules echo -n "Done learning. " ; date ) 2>&1 | tee $results/learn.log @@ -140,30 +145,30 @@ if [ $LEARN_ALL_THEN_FORGET_TEST_SET = 1 ] ; then echo "Forgetting contents of test ham bucket..." - time sa-learn --forget --config-file=$tmpdir/rules --showdots \ + runcmd sa-learn --forget --config-file=$tmpdir/rules --showdots \ --mbox $rdir/hbuckettest echo "Forgetting contents of test spam bucket..." - time sa-learn --forget --config-file=$tmpdir/rules --showdots \ + runcmd sa-learn --forget --config-file=$tmpdir/rules --showdots \ --mbox $rdir/sbuckettest else echo "Learning contents of learn ham bucket..." - time sa-learn --ham --randseed=1 --no-sync $learnargs \ + runcmd sa-learn --ham --randseed=1 --no-sync $learnargs \ --showdots --mbox --config-file=$tmpdir/rules $rdir/hbucketlearn echo "Learning contents of learn spam bucket..." - time sa-learn --spam --randseed=1 --no-sync $learnargs \ + runcmd sa-learn --spam --randseed=1 --no-sync $learnargs \ --showdots --mbox --config-file=$tmpdir/rules $rdir/sbucketlearn - time sa-learn --sync $learnargs --config-file=$tmpdir/rules + runcmd sa-learn --sync $learnargs --config-file=$tmpdir/rules echo "Dumping bayes DB..." ( cd .. ; sa-learn --dump --dbpath=$tmpdir/dbs/bayes ) \ > $rdir/bayes_db.dump fi - time sa-learn --sync --config-file=$tmpdir/rules + runcmd sa-learn --sync --config-file=$tmpdir/rules # take a copy of the trained Bayes DBs, gzipped ( cd $tmpdir ; tar cf - dbs | gzip -c > $rdir/dbs.tgz ) @@ -198,12 +203,12 @@ else echo "Running mass-check on ham bucket..." - time ./mass-check -c=$tmpdir/rules -p=$tmpdir/rules --showdots \ + runcmd ./mass-check -c=$tmpdir/rules -p=$tmpdir/rules --showdots \ --bayes --mbox $rdir/hbuckettest \ > $rdir/nonspam.log echo "Running mass-check on spam bucket..." - time ./mass-check -c=$tmpdir/rules -p=$tmpdir/rules --showdots \ + runcmd ./mass-check -c=$tmpdir/rules -p=$tmpdir/rules --showdots \ --bayes --mbox $rdir/sbuckettest \ > $rdir/spam.log fi Index: rules/v330.pre =================================================================== --- rules/v330.pre (.../trunk) (revision 0) +++ rules/v330.pre (.../branches/bug-5293-pluginized-bayes) (revision 602913) @@ -0,0 +1,20 @@ +# This is the right place to customize your installation of SpamAssassin. +# +# See 'perldoc Mail::SpamAssassin::Conf' for details of what can be +# tweaked. +# +# This file was installed during the installation of SpamAssassin 3.3.0, +# and contains plugin loading commands for the new plugins added in that +# release. It will not be overwritten during future SpamAssassin installs, +# so you can modify it to enable some disabled-by-default plugins below, +# if you so wish. +# +# There are now multiple files read to enable plugins in the +# /etc/mail/spamassassin directory; previously only one, "init.pre" was +# read. Now both "init.pre", "v310.pre", and any other files ending in +# ".pre" will be read. As future releases are made, new plugins will be +# added to new files, named according to the release they're added in. +########################################################################### + +# OSBF - EXPERIMENTAL alternative to Bayes +# loadplugin Mail::SpamAssassin::Plugin::OSBF Index: masses/bayes-testing/bayes-10pcv-driver =================================================================== --- masses/bayes-testing/bayes-10pcv-driver (.../trunk) (revision 602913) +++ masses/bayes-testing/bayes-10pcv-driver (.../branches/bug-5293-pluginized-bayes) (revision 602913) @@ -56,6 +56,7 @@ echo " bayes_path $tmpdir/dbs/bayes +osbf_path $tmpdir/dbs/osbf bayes_auto_learn 0 bayes_min_ham_num 10 bayes_min_spam_num 10 @@ -76,6 +77,10 @@ echo "Restoring full learned DBs..." ( cd $tmpdir; rm -rf dbs; tar xf learned-all.tar ) } +runcmd () { + echo "$*" + time $* +} if [ $LEARN_ALL_THEN_FORGET_TEST_SET = 1 ] ; then @@ -87,14 +92,14 @@ ( echo -n "Learning from all ham buckets..." ; date - time sa-learn --ham --randseed=1 --no-sync $learnargs \ + runcmd sa-learn --ham --randseed=1 --no-sync $learnargs \ --showdots --mbox --config-file=$tmpdir/rules $testdir/cor/ham/* echo -n "Learning from all spam buckets..." ; date - time sa-learn --spam --randseed=1 --no-sync $learnargs \ + runcmd sa-learn --spam --randseed=1 --no-sync $learnargs \ --showdots --mbox --config-file=$tmpdir/rules $testdir/cor/spam/* - time sa-learn --sync $learnargs --config-file=$tmpdir/rules + runcmd sa-learn --sync $learnargs --config-file=$tmpdir/rules echo -n "Done learning. " ; date ) 2>&1 | tee $results/learn.log @@ -140,30 +145,30 @@ if [ $LEARN_ALL_THEN_FORGET_TEST_SET = 1 ] ; then echo "Forgetting contents of test ham bucket..." - time sa-learn --forget --config-file=$tmpdir/rules --showdots \ + runcmd sa-learn --forget --config-file=$tmpdir/rules --showdots \ --mbox $rdir/hbuckettest echo "Forgetting contents of test spam bucket..." - time sa-learn --forget --config-file=$tmpdir/rules --showdots \ + runcmd sa-learn --forget --config-file=$tmpdir/rules --showdots \ --mbox $rdir/sbuckettest else echo "Learning contents of learn ham bucket..." - time sa-learn --ham --randseed=1 --no-sync $learnargs \ + runcmd sa-learn --ham --randseed=1 --no-sync $learnargs \ --showdots --mbox --config-file=$tmpdir/rules $rdir/hbucketlearn echo "Learning contents of learn spam bucket..." - time sa-learn --spam --randseed=1 --no-sync $learnargs \ + runcmd sa-learn --spam --randseed=1 --no-sync $learnargs \ --showdots --mbox --config-file=$tmpdir/rules $rdir/sbucketlearn - time sa-learn --sync $learnargs --config-file=$tmpdir/rules + runcmd sa-learn --sync $learnargs --config-file=$tmpdir/rules echo "Dumping bayes DB..." ( cd .. ; sa-learn --dump --dbpath=$tmpdir/dbs/bayes ) \ > $rdir/bayes_db.dump fi - time sa-learn --sync --config-file=$tmpdir/rules + runcmd sa-learn --sync --config-file=$tmpdir/rules # take a copy of the trained Bayes DBs, gzipped ( cd $tmpdir ; tar cf - dbs | gzip -c > $rdir/dbs.tgz ) @@ -198,12 +203,12 @@ else echo "Running mass-check on ham bucket..." - time ./mass-check -c=$tmpdir/rules -p=$tmpdir/rules --showdots \ + runcmd ./mass-check -c=$tmpdir/rules -p=$tmpdir/rules --showdots \ --bayes --mbox $rdir/hbuckettest \ > $rdir/nonspam.log echo "Running mass-check on spam bucket..." - time ./mass-check -c=$tmpdir/rules -p=$tmpdir/rules --showdots \ + runcmd ./mass-check -c=$tmpdir/rules -p=$tmpdir/rules --showdots \ --bayes --mbox $rdir/sbuckettest \ > $rdir/spam.log fi