Index: lib/Mail/SpamAssassin.pm
===================================================================
--- lib/Mail/SpamAssassin.pm	(.../trunk)	(revision 602913)
+++ lib/Mail/SpamAssassin.pm	(.../branches/bug-5293-pluginized-bayes)	(revision 602913)
@@ -671,7 +671,7 @@
 
 sub finish_learner {
   my $self = shift;
-  $self->{bayes_scanner}->sanity_check_is_untied(1) if $self->{bayes_scanner};
+  $self->{bayes_scanner}->force_close(1) if $self->{bayes_scanner};
   1;
 }
 
@@ -1325,7 +1325,7 @@
   }
 
   # make sure things are ready for scanning
-  $self->{bayes_scanner}->sanity_check_is_untied() if $self->{bayes_scanner};
+  $self->{bayes_scanner}->force_close() if $self->{bayes_scanner};
   $self->call_plugins("compile_now_finish",
 		      { use_user_prefs => $use_user_prefs,
 			keep_userstate => $deal_with_userstate});
Index: lib/Mail/SpamAssassin/BayesStore/SQL.pm
===================================================================
--- lib/Mail/SpamAssassin/BayesStore/SQL.pm	(.../trunk)	(revision 602913)
+++ lib/Mail/SpamAssassin/BayesStore/SQL.pm	(.../branches/bug-5293-pluginized-bayes)	(revision 602913)
@@ -48,7 +48,7 @@
 
 =head2 new
 
-public class (Mail::SpamAssassin::BayesStore::SQL) new (Mail::Spamassassin::Bayes $bayes)
+public class (Mail::SpamAssassin::BayesStore::SQL) new (Mail::Spamassassin::Plugin::Bayes $bayes)
 
 Description:
 This methods creates a new instance of the Mail::SpamAssassin::BayesStore::SQL
@@ -648,7 +648,7 @@
   }  
 
   while (my ($token, $spam_count, $ham_count, $atime) = $sth->fetchrow_array()) {
-    my $prob = $self->{bayes}->compute_prob_for_token($token, $vars[1], $vars[2],
+    my $prob = $self->{bayes}->_compute_prob_for_token($token, $vars[1], $vars[2],
 						      $spam_count, $ham_count);
     $prob ||= 0.5;
 
Index: lib/Mail/SpamAssassin/BayesStore/DBM.pm
===================================================================
--- lib/Mail/SpamAssassin/BayesStore/DBM.pm	(.../trunk)	(revision 602913)
+++ lib/Mail/SpamAssassin/BayesStore/DBM.pm	(.../branches/bug-5293-pluginized-bayes)	(revision 602913)
@@ -980,7 +980,7 @@
     # We have the value already, so just unpack it.
     my ($ts, $th, $atime) = $self->tok_unpack ($tokvalue);
     
-    my $prob = $self->{bayes}->compute_prob_for_token($tok, $vars[1], $vars[2], $ts, $th);
+    my $prob = $self->{bayes}->_compute_prob_for_token($tok, $vars[1], $vars[2], $ts, $th);
     $prob ||= 0.5;
     
     my $encoded_tok = unpack("H*",$tok);
Index: lib/Mail/SpamAssassin/Plugin.pm
===================================================================
--- lib/Mail/SpamAssassin/Plugin.pm	(.../trunk)	(revision 602913)
+++ lib/Mail/SpamAssassin/Plugin.pm	(.../branches/bug-5293-pluginized-bayes)	(revision 602913)
@@ -837,6 +837,106 @@
   %{$self} = ();
 }
 
+=item $plugin->learner_new ()
+
+Used to support human-trained probabilistic classifiers like the BAYES_* ruleset.
+Called when a new C<Mail::SpamAssassin::Bayes> object has been created; typically
+when a new user's scan is about to start.
+
+=item $plugin->learn_message ()
+
+Train the classifier with a training message.
+
+=over 4
+
+=item isspam
+
+1 if the message is spam, 0 if it's non-spam.
+
+=item msg
+
+The message's C<Mail::SpamAssassin::Message> object.
+
+=item id
+
+An optional message-identification string, used internally to tag the message.
+If it is C<undef>, one will be generated.  It should be unique to that message.
+
+=back
+
+=item $plugin->forget_message ()
+
+Tell the classifier to 'forget' its training about a specific message.
+
+=over 4
+
+=item msg
+
+The message's C<Mail::SpamAssassin::Message> object.
+
+=item id
+
+An optional message-identification string, used internally to tag the message.
+If it is C<undef>, one will be generated.  It should be unique to that message.
+
+=back
+
+=item $plugin->learner_sync ()
+
+Tell the classifier to 'sync' any pending changes against the current 
+user's training database.  This is called by C<sa-learn --sync>.
+
+If you do not need to implement these for your classifier, create an
+implementation that just contains C<return 1>.
+
+=item $plugin->learner_expire_old_training ()
+
+Tell the classifier to perform infrequent, time-consuming cleanup of
+the current user's training database.  This is called by C<sa-learn
+--force-expire>.
+
+If you do not need to implement these for your classifier, create an
+implementation that just contains C<return 1>.
+
+=item $plugin->learner_is_scan_available ()
+
+Should return 1 if it is possible to use the current user's training data for
+a message-scan operation, or 0 otherwise.
+
+=item $plugin->learner_dump_database ()
+
+Dump information about the current user's training data to C<stdout>.
+This is called by C<sa-learn --dump>.
+
+=over 4
+
+=item magic
+
+Set to 1 if "magic" name-value metadata should be dumped.
+
+=item toks
+
+Set to 1 if the database of tokens should be dumped.
+
+=item regex
+
+Either C<undef> to dump all tokens, or a value which specifies a regular expression
+subset of the tokens to dump.
+
+=back
+
+=item $plugin->learner_close ()
+
+Close any open databases.
+
+=over 4
+
+=item quiet
+
+Set to 1 if warning messages should be suppressed.
+
+=back
+
 =head1 HELPER APIS
 
 These methods provide an API for plugins to register themselves
Index: lib/Mail/SpamAssassin/PerMsgStatus.pm
===================================================================
--- lib/Mail/SpamAssassin/PerMsgStatus.pm	(.../trunk)	(revision 602913)
+++ lib/Mail/SpamAssassin/PerMsgStatus.pm	(.../branches/bug-5293-pluginized-bayes)	(revision 602913)
@@ -244,7 +244,7 @@
     $self->{main}->finish_learner();        # for now
 
     if (exists $self->{main}->{bayes_scanner}) {
-      $self->{main}->{bayes_scanner}->sanity_check_is_untied();
+      $self->{main}->{bayes_scanner}->force_close();
     }
     1;
   } or do {
@@ -988,52 +988,6 @@
   return $text;
 }
 
-sub bayes_report_make_list {
-  my $self = shift;
-  my $info = shift;
-  my $param = shift || "5";
-  my ($limit,$fmt_arg,$more) = split /,/, $param;
-
-  return "Tokens not available." unless defined $info;
-
-  my %formats = (
-      short => '$t',
-      Short => 'Token: \"$t\"',
-      compact => '$p-$D--$t',
-      Compact => 'Probability $p -declassification distance $D (\"+\" means > 9) --token: \"$t\"',
-      medium => '$p-$D-$N--$t',
-      long => '$p-$d--${h}h-${s}s--${a}d--$t',
-      Long => 'Probability $p -declassification distance $D --in ${h} ham messages -and ${s} spam messages --$a} days old--token:\"$t\"'
-    );
-
-  my $raw_fmt = (!$fmt_arg ? '$p-$D--$t' : $formats{$fmt_arg});
-
-  return "Invalid format, must be one of: ".join(",",keys %formats)
-    unless defined $raw_fmt;
-
-  my $fmt = '"'.$raw_fmt.'"';
-  my $amt = $limit < @$info ? $limit : @$info;
-  return "" unless $amt;
-
-  my $Bayes = $self->{main}{bayes_scanner};
-  return "Bayes not available" unless defined $Bayes;
-  my $ns = $self->{bayes_nspam};
-  my $nh = $self->{bayes_nham};
-  my $digit = sub { $_[0] > 9 ? "+" : $_[0] };
-  my $now = time;
-
-  join ', ', map {
-    my($t,$prob,$s,$h,$u) = @$_;
-    my $a = int(($now - $u)/(3600 * 24));
-    my $d = $Bayes->compute_declassification_distance($ns,$nh,$s,$h,$prob);
-    my $p = sprintf "%.3f", $prob;
-    my $n = $s + $h;
-    my ($c,$o) = $prob < 0.5 ? ($h,$s) : ($s,$h);
-    my ($D,$S,$H,$C,$O,$N) = map &$digit($_), ($d,$s,$h,$c,$o,$n);
-    eval $fmt;  ## no critic
-  } @{$info}[0..$amt-1];
-}
-
 ###########################################################################
 
 # public API for plugins
@@ -1216,33 +1170,6 @@
                         sprintf("%3.4f", $self->{bayes_score}) : "0.5"
             },
 
-            HAMMYTOKENS => sub {
-              $self->bayes_report_make_list
-                ( $self->{bayes_token_info_hammy}, shift );
-            },
-
-            SPAMMYTOKENS => sub {
-              $self->bayes_report_make_list
-                ( $self->{bayes_token_info_spammy}, shift );
-            },
-
-            TOKENSUMMARY => sub {
-              if( defined $self->{tag_data}{BAYESTC} )
-                {
-                  my $tcount_neutral = $self->{tag_data}{BAYESTCLEARNED}
-                    - $self->{tag_data}{BAYESTCSPAMMY}
-                    - $self->{tag_data}{BAYESTCHAMMY};
-                  my $tcount_new = $self->{tag_data}{BAYESTC}
-                    - $self->{tag_data}{BAYESTCLEARNED};
-                  "Tokens: new, $tcount_new; "
-                    ."hammy, $self->{tag_data}{BAYESTCHAMMY}; "
-                    ."neutral, $tcount_neutral; "
-                    ."spammy, $self->{tag_data}{BAYESTCSPAMMY}."
-                } else {
-                  "Bayes not run.";
-                }
-            },
-
             DATE => \&Mail::SpamAssassin::Util::time_to_rfc822_date,
 
             STARS => sub {
Index: lib/Mail/SpamAssassin/PerMsgLearner.pm
===================================================================
--- lib/Mail/SpamAssassin/PerMsgLearner.pm	(.../trunk)	(revision 602913)
+++ lib/Mail/SpamAssassin/PerMsgLearner.pm	(.../branches/bug-5293-pluginized-bayes)	(revision 602913)
@@ -90,8 +90,8 @@
 # Learn the message as spam.
 # 
 # C<$id> is an optional message-identification string, used internally
-# to tag the message.  If it is C<undef>, the Message-Id of the message
-# will be used.  It should be unique to that message.
+# to tag the message.  If it is C<undef>, one will be generated.
+# It should be unique to that message.
 # 
 # This is a semi-private API; callers should use
 # C<$spamtest-E<gt>learn($mail,$id,$isspam,$forget)> instead.
@@ -117,8 +117,8 @@
 # Learn the message as ham.
 # 
 # C<$id> is an optional message-identification string, used internally
-# to tag the message.  If it is C<undef>, the Message-Id of the message
-# will be used.  It should be unique to that message.
+# to tag the message.  If it is C<undef>, one will be generated.
+# It should be unique to that message.
 # 
 # This is a semi-private API; callers should use
 # C<$spamtest-E<gt>learn($mail,$id,$isspam,$forget)> instead.
@@ -141,8 +141,8 @@
 # Forget about a previously-learned message.
 # 
 # C<$id> is an optional message-identification string, used internally
-# to tag the message.  If it is C<undef>, the Message-Id of the message
-# will be used.  It should be unique to that message.
+# to tag the message.  If it is C<undef>, one will be generated.
+# It should be unique to that message.
 # 
 # This is a semi-private API; callers should use
 # C<$spamtest-E<gt>learn($mail,$id,$isspam,$forget)> instead.
Index: lib/Mail/SpamAssassin/BayesStore.pm
===================================================================
--- lib/Mail/SpamAssassin/BayesStore.pm	(.../trunk)	(revision 602913)
+++ lib/Mail/SpamAssassin/BayesStore.pm	(.../branches/bug-5293-pluginized-bayes)	(revision 602913)
@@ -17,12 +17,12 @@
 
 =head1 NAME
 
-Mail::SpamAssassin::BayesStore - Bayesian Storage Module
+Mail::SpamAssassin::BayesStore - Storage Module for default Bayes classifier
 
 =head1 DESCRIPTION
 
 This is the public API for the Bayesian store methods.  Any implementation of
-the storage module must implement these methods.
+the storage module for the default Bayes classifier must implement these methods.
 
 =cut
 
@@ -43,12 +43,12 @@
 
 =item new
 
-public class (Mail::SpamAssassin::BayesStore) new (Mail::SpamAssassin::Bayes $bayes)
+public class (Mail::SpamAssassin::BayesStore) new (Mail::SpamAssassin::Plugin::Bayes $bayes)
 
 Description:
 This method creates a new instance of the Mail::SpamAssassin::BayesStore
-object.  You must pass in an instance of the Mail::SpamAssassin:Bayes object,
-which is stashed for use throughout the module.
+object.  You must pass in an instance of the Mail::SpamAssassin::Plugin::Bayes
+object, which is stashed for use throughout the module.
 
 =cut
 
@@ -89,7 +89,7 @@
 
 Description:
 This method reads any needed config variables from the configuration
-object and then calls the Mail::SpamAssassin::Bayes read_db_configs method.
+object and then calls the Mail::SpamAssassin::Plugin::Plugin::Bayes read_db_configs method.
 
 =cut
 
Index: lib/Mail/SpamAssassin/Plugin/WLBLEval.pm
===================================================================
--- lib/Mail/SpamAssassin/Plugin/WLBLEval.pm	(.../trunk)	(revision 602913)
+++ lib/Mail/SpamAssassin/Plugin/WLBLEval.pm	(.../branches/bug-5293-pluginized-bayes)	(revision 602913)
@@ -121,6 +121,8 @@
   return 0;
 }
 
+# TODO: this should be moved to a utility module off PerMsgStatus,
+# rather than a plugin API; it's used in Bayes.pm as a utility
 sub check_wb_list {
   my ($self, $params) = @_;
 
Index: lib/Mail/SpamAssassin/Plugin/Bayes.pm
===================================================================
--- lib/Mail/SpamAssassin/Plugin/Bayes.pm	(.../trunk)	(revision 602913)
+++ lib/Mail/SpamAssassin/Plugin/Bayes.pm	(.../branches/bug-5293-pluginized-bayes)	(revision 602913)
@@ -15,42 +15,256 @@
 # limitations under the License.
 # </@LICENSE>
 
+=head1 NAME
+
+Mail::SpamAssassin::Plugin::Bayes - determine spammishness using a Bayesian classifier
+
+=head1 DESCRIPTION
+
+This is a Bayesian-style probabilistic classifier, using an algorithm based on
+the one detailed in Paul Graham's I<A Plan For Spam> paper at:
+
+  http://www.paulgraham.com/spam.html
+
+It also incorporates some other aspects taken from Graham Robinson's webpage
+on the subject at:
+
+  http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
+
+And the chi-square probability combiner as described here:
+
+  http://www.linuxjournal.com/print.php?sid=6467
+
+The results are incorporated into SpamAssassin as the BAYES_* rules.
+
+=head1 METHODS
+
+=over 4
+
+=cut
+
 package Mail::SpamAssassin::Plugin::Bayes;
 
-use Mail::SpamAssassin::Plugin;
 use strict;
 use warnings;
 use bytes;
 use re 'taint';
 
-use vars qw(@ISA);
-@ISA = qw(Mail::SpamAssassin::Plugin);
+use Mail::SpamAssassin;
+use Mail::SpamAssassin::Plugin;
+use Mail::SpamAssassin::PerMsgStatus;
+use Mail::SpamAssassin::Logger;
+use Mail::SpamAssassin::Util qw(untaint_var);
 
-# constructor: register the eval rule
+# pick ONLY ONE of these combining implementations.
+use Mail::SpamAssassin::Bayes::CombineChi;
+# use Mail::SpamAssassin::Bayes::CombineNaiveBayes;
+
+use Digest::SHA1 qw(sha1 sha1_hex);
+
+our @ISA = qw(Mail::SpamAssassin::Plugin);
+
+use vars qw{
+  $IGNORED_HDRS
+  $MARK_PRESENCE_ONLY_HDRS
+  %HEADER_NAME_COMPRESSION
+  $OPPORTUNISTIC_LOCK_VALID
+};
+
+# Which headers should we scan for tokens?  Don't use all of them, as it's easy
+# to pick up spurious clues from some.  What we now do is use all of them
+# *less* these well-known headers; that way we can pick up spammers' tracking
+# headers (which are obviously not well-known in advance!).
+
+# Received is handled specially
+$IGNORED_HDRS = qr{(?: (?:X-)?Sender    # misc noise
+  |Delivered-To |Delivery-Date
+  |(?:X-)?Envelope-To
+  |X-MIME-Auto[Cc]onverted |X-Converted-To-Plain-Text
+
+  |Subject      # not worth a tiny gain vs. to db size increase
+
+  # Date: can provide invalid cues if your spam corpus is
+  # older/newer than ham
+  |Date
+
+  # List headers: ignore. a spamfiltering mailing list will
+  # become a nonspam sign.
+  |X-List|(?:X-)?Mailing-List
+  |(?:X-)?List-(?:Archive|Help|Id|Owner|Post|Subscribe
+    |Unsubscribe|Host|Id|Manager|Admin|Comment
+    |Name|Url)
+  |X-Unsub(?:scribe)?
+  |X-Mailman-Version |X-Been[Tt]here |X-Loop
+  |Mail-Followup-To
+  |X-eGroups-(?:Return|From)
+  |X-MDMailing-List
+  |X-XEmacs-List
+
+  # gatewayed through mailing list (thanks to Allen Smith)
+  |(?:X-)?Resent-(?:From|To|Date)
+  |(?:X-)?Original-(?:From|To|Date)
+
+  # Spamfilter/virus-scanner headers: too easy to chain from
+  # these
+  |X-MailScanner(?:-SpamCheck)?
+  |X-Spam(?:-(?:Status|Level|Flag|Report|Hits|Score|Checker-Version))?
+  |X-Antispam |X-RBL-Warning |X-Mailscanner
+  |X-MDaemon-Deliver-To |X-Virus-Scanned
+  |X-Mass-Check-Id
+  |X-Pyzor |X-DCC-\S{2,25}-Metrics
+  |X-Filtered-B[Yy] |X-Scanned-By |X-Scanner
+  |X-AP-Spam-(?:Score|Status) |X-RIPE-Spam-Status
+  |X-SpamCop-[^:]+
+  |X-SMTPD |(?:X-)?Spam-Apparently-To
+  |SPAM |X-Perlmx-Spam
+  |X-Bogosity
+
+  # some noisy Outlook headers that add no good clues:
+  |Content-Class |Thread-(?:Index|Topic)
+  |X-Original[Aa]rrival[Tt]ime
+
+  # Annotations from IMAP, POP, and MH:
+  |(?:X-)?Status |X-Flags |Replied |Forwarded
+  |Lines |Content-Length
+  |X-UIDL? |X-IMAPbase
+
+  # Annotations from Bugzilla
+  |X-Bugzilla-[^:]+
+
+  # Annotations from VM: (thanks to Allen Smith)
+  |X-VM-(?:Bookmark|(?:POP|IMAP)-Retrieved|Labels|Last-Modified
+    |Summary-Format|VHeader|v\d-Data|Message-Order)
+
+  # Annotations from Gnus:
+  | X-Gnus-Mail-Source
+  | Xref
+
+)}x;
+
+# Note only the presence of these headers, in order to reduce the
+# hapaxen they generate.
+$MARK_PRESENCE_ONLY_HDRS = qr{(?: X-Face
+  |X-(?:Gnu-?PG|PGP|GPG)(?:-Key)?-Fingerprint
+)}ix;
+
+# tweaks tested as of Nov 18 2002 by jm: see SpamAssassin-devel list archives
+# for results.  The winners are now the default settings.
+use constant IGNORE_TITLE_CASE => 1;
+use constant TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES => 1;
+use constant TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
+
+# tweaks of May 12 2003, see SpamAssassin-devel archives again.
+use constant PRE_CHEW_ADDR_HEADERS => 1;
+use constant CHEW_BODY_URIS => 1;
+use constant CHEW_BODY_MAILADDRS => 1;
+use constant HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
+use constant BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
+use constant URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 0;
+use constant IGNORE_MSGID_TOKENS => 0;
+
+# tweaks of 12 March 2004, see bug 2129.
+use constant DECOMPOSE_BODY_TOKENS => 1;
+use constant MAP_HEADERS_MID => 1;
+use constant MAP_HEADERS_FROMTOCC => 1;
+use constant MAP_HEADERS_USERAGENT => 1;
+
+# tweaks, see http://issues.apache.org/SpamAssassin/show_bug.cgi?id=3173#c26
+use constant ADD_INVIZ_TOKENS_I_PREFIX => 1;
+use constant ADD_INVIZ_TOKENS_NO_PREFIX => 0;
+
+# We store header-mined tokens in the db with a "HHeaderName:val" format.
+# some headers may contain lots of gibberish tokens, so allow a little basic
+# compression by mapping the header name at least here.  these are the headers
+# which appear with the most frequency in my db.  note: this doesn't have to
+# be 2-way (ie. LHSes that map to the same RHS are not a problem), but mixing
+# tokens from multiple different headers may impact accuracy, so might as well
+# avoid this if possible. These are the top ones from my corpus, BTW (jm).
+%HEADER_NAME_COMPRESSION = (
+  'Message-Id'		=> '*m',
+  'Message-ID'		=> '*M',
+  'Received'		=> '*r',
+  'User-Agent'		=> '*u',
+  'References'		=> '*f',
+  'In-Reply-To'		=> '*i',
+  'From'		=> '*F',
+  'Reply-To'		=> '*R',
+  'Return-Path'		=> '*p',
+  'Return-path'		=> '*rp',
+  'X-Mailer'		=> '*x',
+  'X-Authentication-Warning' => '*a',
+  'Organization'	=> '*o',
+  'Organisation'        => '*o',
+  'Content-Type'	=> '*c',
+  'X-Spam-Relays-Trusted' => '*RT',
+  'X-Spam-Relays-Untrusted' => '*RU',
+);
+
+# How many seconds should the opportunistic_expire lock be valid?
+$OPPORTUNISTIC_LOCK_VALID = 300;
+
+# Should we use the Robinson f(w) equation from
+# http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html ?
+# It gives better results, in that scores are more likely to distribute
+# into the <0.5 range for nonspam and >0.5 for spam.
+use constant USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS => 1;
+
+# How many of the most significant tokens should we use for the p(w)
+# calculation?
+use constant N_SIGNIFICANT_TOKENS => 150;
+
+# How many significant tokens are required for a classifier score to
+# be considered usable?
+use constant REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE => -1;
+
+# How long a token should we hold onto?  (note: German speakers typically
+# will require a longer token than English ones.)
+use constant MAX_TOKEN_LENGTH => 15;
+
+###########################################################################
+
 sub new {
   my $class = shift;
-  my $mailsaobject = shift;
+  my ($main) = @_;
 
-  # some boilerplate...
   $class = ref($class) || $class;
-  my $self = $class->SUPER::new($mailsaobject);
+  my $self = $class->SUPER::new($main);
   bless ($self, $class);
 
-  # the important bit!
+  $self->{main} = $main;
+  $self->{conf} = $main->{conf};
+  $self->{use_ignores} = 1;
+
   $self->register_eval_rule("check_bayes");
+  $self;
+}
 
-  return $self;
+sub finish {
+  my $self = shift;
+  if ($self->{store}) {
+    $self->{store}->untie_db();
+  }
+  %{$self} = ();
 }
 
+# Plugin hook.
+# Return this implementation object, for callers that need to know
+# it.  TODO: callers shouldn't *need* to know it! 
+# used only in test suite to get access to {store}, internal APIs.
+#
+sub learner_get_implementation { return shift; }
+
+###########################################################################
+
 sub check_bayes {
   my ($self, $pms, $fulltext, $min, $max) = @_;
 
+  return 0 if (!$pms->{conf}->{use_learner});
   return 0 if (!$pms->{conf}->{use_bayes} || !$pms->{conf}->{use_bayes_rules});
 
   if (!exists ($pms->{bayes_score})) {
     my $timer = $self->{main}->time_method("check_bayes");
-    $pms->{bayes_score} =
-      $self->{main}->{bayes_scanner}->scan ($pms, $pms->{msg});
+    $pms->{bayes_score} = $self->scan($pms, $pms->{msg});
   }
 
   if (defined $pms->{bayes_score} &&
@@ -71,4 +285,1264 @@
   return 0;
 }
 
+###########################################################################
+
+# Plugin hook.
+sub learner_close {
+  my ($self, $params) = @_;
+  my $quiet = $params->{quiet};
+
+  # do a sanity check here.  Wierd things happen if we remain tied
+  # after compiling; for example, spamd will never see that the
+  # number of messages has reached the bayes-scanning threshold.
+  if ($self->{store}->db_readable()) {
+    warn "bayes: oops! still tied to bayes DBs, untying\n" unless $quiet;
+    $self->{store}->untie_db();
+  }
+}
+
+###########################################################################
+
+# read configuration items to control bayes behaviour.  Called by
+# BayesStore::read_db_configs().
+sub read_db_configs {
+  my ($self) = @_;
+
+  # use of hapaxes.  Set on bayes object, since it controls prob
+  # computation.
+  $self->{use_hapaxes} = $self->{conf}->{bayes_use_hapaxes};
+}
+###########################################################################
+
+sub ignore_message {
+  my ($self,$PMS) = @_;
+
+  return 0 unless $self->{use_ignores};
+
+  my $ig_from = $self->{main}->call_plugins ("check_wb_list",
+        { permsgstatus => $PMS, type => 'from', list => 'bayes_ignore_from' });
+  my $ig_to = $self->{main}->call_plugins ("check_wb_list",
+        { permsgstatus => $PMS, type => 'to', list => 'bayes_ignore_to' });
+
+  my $ignore = $ig_from || $ig_to;
+
+  dbg("bayes: not using bayes, bayes_ignore_from or _to rule") if $ignore;
+
+  return $ignore;
+}
+
+###########################################################################
+
+# Plugin hook.
+sub learn_message {
+  my ($self, $params) = @_;
+  my $isspam = $params->{isspam};
+  my $msg = $params->{msg};
+  my $id = $params->{id};
+
+  if (!$self->{conf}->{use_bayes}) { return; }
+
+  my $msgdata = $self->get_body_from_msg ($msg);
+  my $ret;
+
+  eval {
+    local $SIG{'__DIE__'};	# do not run user die() traps in here
+
+    my $ok;
+    if ($self->{main}->{learn_to_journal}) {
+      # If we're going to learn to journal, we'll try going r/o first...
+      # If that fails for some reason, let's try going r/w.  This happens
+      # if the DB doesn't exist yet.
+      $ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable();
+    } else {
+      $ok = $self->{store}->tie_db_writable();
+    }
+
+    if ($ok) {
+      $ret = $self->_learn_trapped ($isspam, $msg, $msgdata, $id);
+
+      if (!$self->{main}->{learn_caller_will_untie}) {
+        $self->{store}->untie_db();
+      }
+    }
+    1;
+  } or do {		# if we died, untie the dbs.
+    my $eval_stat = $@ ne '' ? $@ : "errno=$!";  chomp $eval_stat;
+    $self->{store}->untie_db();
+    die "bayes: (in learn) $eval_stat\n";
+  };
+
+  return $ret;
+}
+
+# this function is trapped by the wrapper above
+sub _learn_trapped {
+  my ($self, $isspam, $msg, $msgdata, $msgid) = @_;
+  my @msgid = ( $msgid );
+
+  if (!defined $msgid) {
+    @msgid = $self->get_msgid($msg);
+  }
+
+  foreach $msgid ( @msgid ) {
+    my $seen = $self->{store}->seen_get ($msgid);
+
+    if (defined ($seen)) {
+      if (($seen eq 's' && $isspam) || ($seen eq 'h' && !$isspam)) {
+        dbg("bayes: $msgid already learnt correctly, not learning twice");
+        return 0;
+      } elsif ($seen !~ /^[hs]$/) {
+        warn("bayes: db_seen corrupt: value='$seen' for $msgid, ignored");
+      } else {
+        # bug 3704: If the message was already learned, don't try learning it again.
+        # this prevents, for instance, manually learning as spam, then autolearning
+        # as ham, or visa versa.
+        if ($self->{main}->{learn_no_relearn}) {
+	  dbg("bayes: $msgid already learnt as opposite, not re-learning");
+	  return 0;
+	}
+
+        dbg("bayes: $msgid already learnt as opposite, forgetting first");
+
+        # kluge so that forget() won't untie the db on us ...
+        my $orig = $self->{main}->{learn_caller_will_untie};
+        $self->{main}->{learn_caller_will_untie} = 1;
+
+        my $fatal = !defined $self->{main}->{bayes_scanner}->forget ($msg);
+
+        # reset the value post-forget() ...
+        $self->{main}->{learn_caller_will_untie} = $orig;
+    
+        # forget() gave us a fatal error, so propagate that up
+        if ($fatal) {
+          dbg("bayes: forget() returned a fatal error, so learn() will too");
+	  return;
+        }
+      }
+
+      # we're only going to have seen this once, so stop if it's been
+      # seen already
+      last;
+    }
+  }
+
+  # Now that we're sure we haven't seen this message before ...
+  $msgid = $msgid[0];
+
+  if ($isspam) {
+    $self->{store}->nspam_nham_change (1, 0);
+  } else {
+    $self->{store}->nspam_nham_change (0, 1);
+  }
+
+  my $msgatime = $msg->receive_date();
+
+  # If the message atime comes back as being more than 1 day in the
+  # future, something's messed up and we should revert to current time as
+  # a safety measure.
+  #
+  $msgatime = time if ( $msgatime - time > 86400 );
+
+  my $tokens = $self->tokenize($msg, $msgdata);
+
+  if ($isspam) {
+    $self->{store}->multi_tok_count_change(1, 0, $tokens, $msgatime);
+  } else {
+    $self->{store}->multi_tok_count_change(0, 1, $tokens, $msgatime);
+  }
+
+  $self->{store}->seen_put ($msgid, ($isspam ? 's' : 'h'));
+  $self->{store}->cleanup();
+
+  $self->{main}->call_plugins("bayes_learn", { toksref => $tokens,
+					       isspam => $isspam,
+					       msgid => $msgid,
+					       msgatime => $msgatime,
+					     });
+
+  dbg("bayes: learned '$msgid', atime: $msgatime");
+
+  1;
+}
+
+###########################################################################
+
+# Plugin hook.
+sub forget_message {
+  my ($self, $params) = @_;
+  my $msg = $params->{msg};
+  my $id = $params->{id};
+
+  if (!$self->{conf}->{use_bayes}) { return; }
+
+  my $msgdata = $self->get_body_from_msg ($msg);
+  my $ret;
+
+  # we still tie for writing here, since we write to the seen db
+  # synchronously
+  eval {
+    local $SIG{'__DIE__'};	# do not run user die() traps in here
+
+    my $ok;
+    if ($self->{main}->{learn_to_journal}) {
+      # If we're going to learn to journal, we'll try going r/o first...
+      # If that fails for some reason, let's try going r/w.  This happens
+      # if the DB doesn't exist yet.
+      $ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable();
+    } else {
+      $ok = $self->{store}->tie_db_writable();
+    }
+
+    if ($ok) {
+      $ret = $self->_forget_trapped ($msg, $msgdata, $id);
+
+      if (!$self->{main}->{learn_caller_will_untie}) {
+        $self->{store}->untie_db();
+      }
+    }
+    1;
+  } or do {		# if we died, untie the dbs.
+    my $eval_stat = $@ ne '' ? $@ : "errno=$!";  chomp $eval_stat;
+    $self->{store}->untie_db();
+    die "bayes: (in forget) $eval_stat\n";
+  };
+
+  return $ret;
+}
+
+# this function is trapped by the wrapper above
+sub _forget_trapped {
+  my ($self, $msg, $msgdata, $msgid) = @_;
+  my @msgid = ( $msgid );
+  my $isspam;
+
+  if (!defined $msgid) {
+    @msgid = $self->get_msgid($msg);
+  }
+
+  while( $msgid = shift @msgid ) {
+    my $seen = $self->{store}->seen_get ($msgid);
+
+    if (defined ($seen)) {
+      if ($seen eq 's') {
+        $isspam = 1;
+      } elsif ($seen eq 'h') {
+        $isspam = 0;
+      } else {
+        dbg("bayes: forget: msgid $msgid seen entry is neither ham nor spam, ignored");
+        return 0;
+      }
+
+      # messages should only be learned once, so stop if we find a msgid
+      # which was seen before
+      last;
+    }
+    else {
+      dbg("bayes: forget: msgid $msgid not learnt, ignored");
+    }
+  }
+
+  # This message wasn't learnt before, so return
+  if (!defined $isspam) {
+    dbg("bayes: forget: no msgid from this message has been learnt, skipping message");
+    return 0;
+  }
+  elsif ($isspam) {
+    $self->{store}->nspam_nham_change (-1, 0);
+  }
+  else {
+    $self->{store}->nspam_nham_change (0, -1);
+  }
+
+  my $tokens = $self->tokenize($msg, $msgdata);
+
+  if ($isspam) {
+    $self->{store}->multi_tok_count_change (-1, 0, $tokens);
+  } else {
+    $self->{store}->multi_tok_count_change (0, -1, $tokens);
+  }
+
+  $self->{store}->seen_delete ($msgid);
+  $self->{store}->cleanup();
+
+  $self->{main}->call_plugins("bayes_forget", { toksref => $tokens,
+						isspam => $isspam,
+						msgid => $msgid,
+					      });
+
+  1;
+}
+
+###########################################################################
+
+# Plugin hook.
+sub learner_sync {
+  my ($self, $params) = @_;
+  if (!$self->{conf}->{use_bayes}) { return 0; }
+  dbg("bayes: bayes journal sync starting");
+  $self->{store}->sync($params);
+  dbg("bayes: bayes journal sync completed");
+}
+
+###########################################################################
+
+# Plugin hook.
+sub learner_expire_old_training {
+  my ($self, $params) = @_;
+  if (!$self->{conf}->{use_bayes}) { return 0; }
+  dbg("bayes: expiry starting");
+  $self->{store}->expire_old_tokens($params);
+  dbg("bayes: expiry completed");
+}
+
+###########################################################################
+
+# Plugin hook.
+# Check to make sure we can tie() the DB, and we have enough entries to do a scan
+# if we're told the caller will untie(), go ahead and leave the db tied.
+sub learner_is_scan_available {
+  my ($self, $params) = @_;
+
+  return 0 unless $self->{conf}->{use_bayes};
+  return 0 unless $self->{store}->tie_db_readonly();
+
+  # We need the DB to stay tied, so if the journal sync occurs, don't untie!
+  my $caller_untie = $self->{main}->{learn_caller_will_untie};
+  $self->{main}->{learn_caller_will_untie} = 1;
+
+  # Do a journal sync if necessary.  Do this before the nspam_nham_get()
+  # call since the sync may cause an update in the number of messages
+  # learnt.
+  $self->_opportunistic_calls(1);
+
+  # Reset the variable appropriately
+  $self->{main}->{learn_caller_will_untie} = $caller_untie;
+
+  my ($ns, $nn) = $self->{store}->nspam_nham_get();
+
+  if ($ns < $self->{conf}->{bayes_min_spam_num}) {
+    dbg("bayes: not available for scanning, only $ns spam(s) in bayes DB < ".$self->{conf}->{bayes_min_spam_num});
+    if (!$self->{main}->{learn_caller_will_untie}) {
+      $self->{store}->untie_db();
+    }
+    return 0;
+  }
+  if ($nn < $self->{conf}->{bayes_min_ham_num}) {
+    dbg("bayes: not available for scanning, only $nn ham(s) in bayes DB < ".$self->{conf}->{bayes_min_ham_num});
+    if (!$self->{main}->{learn_caller_will_untie}) {
+      $self->{store}->untie_db();
+    }
+    return 0;
+  }
+
+  return 1;
+}
+
+###########################################################################
+
+sub scan {
+  my ($self, $permsgstatus, $msg) = @_;
+  my $score;
+
+  return unless $self->{conf}->{use_learner};
+
+  # When we're doing a scan, we'll guarantee that we'll do the untie,
+  # so override the global setting until we're done.
+  my $caller_untie = $self->{main}->{learn_caller_will_untie};
+  $self->{main}->{learn_caller_will_untie} = 1;
+
+  goto skip if ($self->{main}->{bayes_scanner}->ignore_message($permsgstatus));
+
+  goto skip unless $self->learner_is_scan_available();
+
+  my ($ns, $nn) = $self->{store}->nspam_nham_get();
+
+  ## if ($self->{log_raw_counts}) { # see _compute_prob_for_token()
+  ## $self->{raw_counts} = " ns=$ns nn=$nn ";
+  ## }
+
+  dbg("bayes: corpus size: nspam = $ns, nham = $nn");
+
+  my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);
+
+  my $msgtokens = $self->tokenize($msg, $msgdata);
+
+  my $tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens});
+
+  my %pw;
+
+  foreach my $tokendata (@{$tokensdata}) {
+    my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata};
+    my $prob = $self->_compute_prob_for_token($token, $ns, $nn, $tok_spam, $tok_ham);
+    next unless defined $prob;
+
+    $pw{$token} = {
+      prob => $prob,
+      spam_count => $tok_spam,
+      ham_count => $tok_ham,
+      atime => $atime
+    };
+  }
+
+  # If none of the tokens were found in the DB, we're going to skip
+  # this message...
+  if (!keys %pw) {
+    dbg("bayes: cannot use bayes on this message; none of the tokens were found in the database");
+    goto skip;
+  }
+
+  my $tcount_total = keys %{$msgtokens};
+  my $tcount_learned = keys %pw;
+
+  # Figure out the message receive time (used as atime below)
+  # If the message atime comes back as being in the future, something's
+  # messed up and we should revert to current time as a safety measure.
+  #
+  my $msgatime = $msg->receive_date();
+  my $now = time;
+  $msgatime = $now if ( $msgatime > $now );
+
+  # now take the $count most significant tokens and calculate probs using
+  # Robinson's formula.
+  my $count = N_SIGNIFICANT_TOKENS;
+  my @sorted;
+
+  my @touch_tokens;
+  my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = [];
+  my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = [];
+
+  my %tok_strength = map { $_ => (abs($pw{$_}->{prob} - 0.5)) } keys %pw;
+  my $log_each_token = (would_log('dbg', 'bayes') > 1);
+
+  foreach my $tok (sort {
+              $tok_strength{$b} <=> $tok_strength{$a}
+            } keys %pw)
+  {
+    if ($count-- < 0) { last; }
+    next if ($tok_strength{$tok} <
+                $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH);
+
+    my $pw = $pw{$tok}->{prob};
+
+    # What's more expensive, scanning headers for HAMMYTOKENS and
+    # SPAMMYTOKENS tags that aren't there or collecting data that
+    # won't be used?  Just collecting the data is certainly simpler.
+    #
+    my $raw_token = $msgtokens->{$tok} || "(unknown)";
+    my $s = $pw{$tok}->{spam_count};
+    my $n = $pw{$tok}->{ham_count};
+    my $a = $pw{$tok}->{atime};
+
+    if ($pw < 0.5) {
+      push @$tinfo_hammy,  [$raw_token,$pw,$s,$n,$a];
+    } else {
+      push @$tinfo_spammy, [$raw_token,$pw,$s,$n,$a];
+    }
+
+    push (@sorted, $pw);
+
+    # update the atime on this token, it proved useful
+    push(@touch_tokens, $tok);
+
+    if ($log_each_token) {
+      dbg("bayes: token '$raw_token' => $pw");
+    }
+  }
+
+  if (!@sorted || (REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE > 0 && 
+	$#sorted <= REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE))
+  {
+    dbg("bayes: cannot use bayes on this message; not enough usable tokens found");
+    goto skip;
+  }
+
+  $score = Mail::SpamAssassin::Bayes::Combine::combine($ns, $nn, \@sorted);
+
+  # Couldn't come up with a probability?
+  goto skip unless defined $score;
+
+  dbg("bayes: score = $score");
+
+  # no need to call tok_touch_all unless there were significant
+  # tokens and a score was returned
+  # we don't really care about the return value here
+  $self->{store}->tok_touch_all(\@touch_tokens, $msgatime);
+
+  $permsgstatus->{bayes_nspam} = $ns;
+  $permsgstatus->{bayes_nham} = $nn;
+
+  ## if ($self->{log_raw_counts}) { # see _compute_prob_for_token()
+  ## print "#Bayes-Raw-Counts: $self->{raw_counts}\n";
+  ## }
+
+  $self->{main}->call_plugins("bayes_scan", { toksref => $msgtokens,
+					      probsref => \%pw,
+					      score => $score,
+					      msgatime => $msgatime,
+					      significant_tokens => \@touch_tokens,
+					    });
+
+skip:
+  if (!defined $score) {
+    dbg("bayes: not scoring message, returning undef");
+  }
+
+  # Take any opportunistic actions we can take
+  if ($self->{main}->{opportunistic_expire_check_only}) {
+    # we're supposed to report on expiry only -- so do the
+    # _opportunistic_calls() run for the journal only.
+    $self->_opportunistic_calls(1);
+    $permsgstatus->{bayes_expiry_due} = $self->{store}->expiry_due();
+  }
+  else {
+    $self->_opportunistic_calls();
+  }
+
+  # Do any cleanup we need to do
+  $self->{store}->cleanup();
+
+  # Reset the value accordingly
+  $self->{main}->{learn_caller_will_untie} = $caller_untie;
+
+  # If our caller won't untie the db, we need to do it.
+  if (!$caller_untie) {
+    $self->{store}->untie_db();
+  }
+
+  $permsgstatus->set_tag ('BAYESTCHAMMY',
+                        ($tinfo_hammy ? scalar @{$tinfo_hammy} : 0));
+  $permsgstatus->set_tag ('BAYESTCSPAMMY',
+                        ($tinfo_spammy ? scalar @{$tinfo_spammy} : 0));
+  $permsgstatus->set_tag ('BAYESTCLEARNED', $tcount_learned);
+  $permsgstatus->set_tag ('BAYESTC', $tcount_total);
+
+  $permsgstatus->set_tag ('HAMMYTOKENS', sub {
+              $self->bayes_report_make_list
+                ($permsgstatus, $permsgstatus->{bayes_token_info_hammy}, shift);
+            });
+
+  $permsgstatus->set_tag ('SPAMMYTOKENS', sub {
+              $self->bayes_report_make_list
+                ($permsgstatus, $permsgstatus->{bayes_token_info_spammy}, shift);
+            });
+
+  $permsgstatus->set_tag ('TOKENSUMMARY', sub {
+              if( defined $self->{tag_data}{BAYESTC} )
+                {
+                  my $tcount_neutral = $permsgstatus->{tag_data}{BAYESTCLEARNED}
+                                    - $permsgstatus->{tag_data}{BAYESTCSPAMMY}
+                                    - $permsgstatus->{tag_data}{BAYESTCHAMMY};
+                  my $tcount_new = $permsgstatus->{tag_data}{BAYESTC}
+                                    - $permsgstatus->{tag_data}{BAYESTCLEARNED};
+                  "Tokens: new, $tcount_new; "
+                    ."hammy, $permsgstatus->{tag_data}{BAYESTCHAMMY}; "
+                    ."neutral, $tcount_neutral; "
+                    ."spammy, $permsgstatus->{tag_data}{BAYESTCSPAMMY}."
+                } else {
+                  "Bayes not run.";
+                }
+            });
+
+
+  return $score;
+}
+
+###########################################################################
+
+# Plugin hook.
+sub learner_dump_database {
+  my ($self, $params) = @_;
+  my $magic = $params->{magic};
+  my $toks = $params->{toks};
+  my $regex = $params->{regex};
+
+  # allow dump to occur even if use_bayes disables everything else ...
+  #return 0 unless $self->{conf}->{use_bayes};
+  return 0 unless $self->{store}->tie_db_readonly();
+  
+  my @vars = $self->{store}->get_storage_variables();
+
+  my($sb,$ns,$nh,$nt,$le,$oa,$bv,$js,$ad,$er,$na) = @vars;
+
+  my $template = '%3.3f %10u %10u %10u  %s'."\n";
+
+  if ( $magic ) {
+    printf ($template, 0.0, 0, $bv, 0, 'non-token data: bayes db version');
+    printf ($template, 0.0, 0, $ns, 0, 'non-token data: nspam');
+    printf ($template, 0.0, 0, $nh, 0, 'non-token data: nham');
+    printf ($template, 0.0, 0, $nt, 0, 'non-token data: ntokens');
+    printf ($template, 0.0, 0, $oa, 0, 'non-token data: oldest atime');
+    printf ($template, 0.0, 0, $na, 0, 'non-token data: newest atime') if ( $bv >= 2 );
+    printf ($template, 0.0, 0, $sb, 0, 'non-token data: current scan-count') if ( $bv < 2 );
+    printf ($template, 0.0, 0, $js, 0, 'non-token data: last journal sync atime') if ( $bv >= 2 );
+    printf ($template, 0.0, 0, $le, 0, 'non-token data: last expiry atime');
+    if ( $bv >= 2 ) {
+      printf ($template, 0.0, 0, $ad, 0, 'non-token data: last expire atime delta');
+      printf ($template, 0.0, 0, $er, 0, 'non-token data: last expire reduction count');
+    }
+  }
+
+  if ( $toks ) {
+    # let the store sort out the db_toks
+    $self->{store}->dump_db_toks($template, $regex, @vars);
+  }
+
+  if (!$self->{main}->{learn_caller_will_untie}) {
+    $self->{store}->untie_db();
+  }
+  return 1;
+}
+
+###########################################################################
+# TODO: these are NOT public, but the test suite needs to call them.
+
+sub get_msgid {
+  my ($self, $msg) = @_;
+
+  my @msgid;
+
+  my $msgid = $msg->get_header("Message-Id");
+  if (defined $msgid && $msgid ne '' && $msgid !~ /^\s*<\s*(?:\@sa_generated)?>.*$/) {
+    # remove \r and < and > prefix/suffixes
+    chomp $msgid;
+    $msgid =~ s/^<//; $msgid =~ s/>.*$//g;
+    push(@msgid, $msgid);
+  }
+
+  # Use sha1_hex(Date:, last received: and top N bytes of body)
+  # where N is MIN(1024 bytes, 1/2 of body length)
+  #
+  my $date = $msg->get_header("Date");
+  $date = "None" if (!defined $date || $date eq ''); # No Date?
+
+  my @rcvd = $msg->get_header("Received");
+  my $rcvd = $rcvd[$#rcvd];
+  $rcvd = "None" if (!defined $rcvd || $rcvd eq ''); # No Received?
+
+  # Make a copy since pristine_body is a reference ...
+  my $body = join('', $msg->get_pristine_body());
+  if (length($body) > 64) { # Small Body?
+    my $keep = ( length $body > 2048 ? 1024 : int(length($body) / 2) );
+    substr($body, $keep) = '';
+  }
+
+  unshift(@msgid, sha1_hex($date."\000".$rcvd."\000".$body).'@sa_generated');
+
+  return wantarray ? @msgid : $msgid[0];
+}
+
+sub get_body_from_msg {
+  my ($self, $msg) = @_;
+
+  if (!ref $msg) {
+    # I have no idea why this seems to happen. TODO
+    warn "bayes: msg not a ref: '$msg'";
+    return { };
+  }
+
+  my $permsgstatus =
+        Mail::SpamAssassin::PerMsgStatus->new($self->{main}, $msg);
+  $msg->extract_message_metadata ($permsgstatus);
+  my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);
+  $permsgstatus->finish();
+
+  if (!defined $msgdata) {
+    # why?!
+    warn "bayes: failed to get body for ".scalar($self->get_msgid($self->{msg}))."\n";
+    return { };
+  }
+
+  return $msgdata;
+}
+
+sub _get_msgdata_from_permsgstatus {
+  my ($self, $msg) = @_;
+
+  my $msgdata = { };
+  $msgdata->{bayes_token_body} = $msg->{msg}->get_visible_rendered_body_text_array();
+  $msgdata->{bayes_token_inviz} = $msg->{msg}->get_invisible_rendered_body_text_array();
+  @{$msgdata->{bayes_token_uris}} = $msg->get_uri_list();
+  return $msgdata;
+}
+
+###########################################################################
+
+# The calling functions expect a uniq'ed array of tokens ...
+sub tokenize {
+  my ($self, $msg, $msgdata) = @_;
+
+  # the body
+  my @tokens = map { $self->_tokenize_line ($_, '', 1) }
+                                    @{$msgdata->{bayes_token_body}};
+
+  # the URI list
+  push (@tokens, map { $self->_tokenize_line ($_, '', 2) }
+                                    @{$msgdata->{bayes_token_uris}});
+
+  # add invisible tokens
+  if (ADD_INVIZ_TOKENS_I_PREFIX) {
+    push (@tokens, map { $self->_tokenize_line ($_, "I*:", 1) }
+                                    @{$msgdata->{bayes_token_inviz}});
+  }
+  if (ADD_INVIZ_TOKENS_NO_PREFIX) {
+    push (@tokens, map { $self->_tokenize_line ($_, "", 1) }
+                                    @{$msgdata->{bayes_token_inviz}});
+  }
+
+  # Tokenize the headers
+  my %hdrs = $self->_tokenize_headers ($msg);
+  while( my($prefix, $value) = each %hdrs ) {
+    push(@tokens, $self->_tokenize_line ($value, "H$prefix:", 0));
+  }
+
+  # Go ahead and uniq the array, skip null tokens (can happen sometimes)
+  # generate an SHA1 hash and take the lower 40 bits as our token
+  my %tokens;
+  foreach my $token (@tokens) {
+    next unless length($token); # skip 0 length tokens
+    $tokens{substr(sha1($token), -5)} = $token;
+  }
+
+  # return the keys == tokens ...
+  return \%tokens;
+}
+
+sub _tokenize_line {
+  my $self = $_[0];
+  my $tokprefix = $_[2];
+  my $region = $_[3];
+  local ($_) = $_[1];
+
+  my @rettokens;
+
+  # include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings,
+  # and ISO-8859-15 alphas.  Do not split on @'s; better results keeping it.
+  # Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!"
+  tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs;
+
+  # DO split on "..." or "--" or "---"; common formatting error resulting in
+  # hapaxes.  Keep the separator itself as a token, though, as long ones can
+  # be good spamsigns.
+  s/(\w)(\.{3,6})(\w)/$1 $2 $3/gs;
+  s/(\w)(\-{2,6})(\w)/$1 $2 $3/gs;
+
+  if (IGNORE_TITLE_CASE) {
+    if ($region == 1 || $region == 2) {
+      # lower-case Title Case at start of a full-stop-delimited line (as would
+      # be seen in a Western language).
+      s/(?:^|\.\s+)([A-Z])([^A-Z]+)(?:\s|$)/ ' '. (lc $1) . $2 . ' ' /ge;
+    }
+  }
+
+  my $magic_re = $self->{store}->get_magic_re();
+
+  foreach my $token (split) {
+    $token =~ s/^[-'"\.,]+//;        # trim non-alphanum chars at start or end
+    $token =~ s/[-'"\.,]+$//;        # so we don't get loads of '"foo' tokens
+
+    # Skip false magic tokens
+    # TVD: we need to do a defined() check since SQL doesn't have magic
+    # tokens, so the SQL BayesStore returns undef.  I really want a way
+    # of optimizing that out, but I haven't come up with anything yet.
+    #
+    next if ( defined $magic_re && $token =~ /$magic_re/ );
+
+    # *do* keep 3-byte tokens; there's some solid signs in there
+    my $len = length($token);
+
+    # but extend the stop-list. These are squarely in the gray
+    # area, and it just slows us down to record them.
+    # See http://wiki.apache.org/spamassassin/BayesStopList for more info.
+    #
+    next if $len < 3 ||
+	($token =~ /^(?:a(?:ble|l(?:ready|l)|n[dy]|re)|b(?:ecause|oth)|c(?:an|ome)|e(?:ach|mail|ven)|f(?:ew|irst|or|rom)|give|h(?:a(?:ve|s)|ttp)|i(?:n(?:formation|to)|t\'s)|just|know|l(?:ike|o(?:ng|ok))|m(?:a(?:de|il(?:(?:ing|to))?|ke|ny)|o(?:re|st)|uch)|n(?:eed|o[tw]|umber)|o(?:ff|n(?:ly|e)|ut|wn)|p(?:eople|lace)|right|s(?:ame|ee|uch)|t(?:h(?:at|is|rough|e)|ime)|using|w(?:eb|h(?:ere|y)|ith(?:out)?|or(?:ld|k))|y(?:ears?|ou(?:(?:\'re|r))?))$/i);
+
+    # are we in the body?  If so, apply some body-specific breakouts
+    if ($region == 1 || $region == 2) {
+      if (CHEW_BODY_MAILADDRS && $token =~ /\S\@\S/i) {
+	push (@rettokens, $self->_tokenize_mail_addrs ($token));
+      }
+      elsif (CHEW_BODY_URIS && $token =~ /\S\.[a-z]/i) {
+	push (@rettokens, "UD:".$token); # the full token
+	my $bit = $token; while ($bit =~ s/^[^\.]+\.(.+)$/$1/gs) {
+	  push (@rettokens, "UD:".$1); # UD = URL domain
+	}
+      }
+    }
+
+    # note: do not trim down overlong tokens if they contain '*'.  This is
+    # used as part of split tokens such as "HTo:D*net" indicating that 
+    # the domain ".net" appeared in the To header.
+    #
+    if ($len > MAX_TOKEN_LENGTH && $token !~ /\*/) {
+      if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {
+	# Matt sez: "Could be asian? Autrijus suggested doing character ngrams,
+	# but I'm doing tuples to keep the dbs small(er)."  Sounds like a plan
+	# to me! (jm)
+	while ($token =~ s/^(..?)//) {
+	  push (@rettokens, "8:$1");
+	}
+	next;
+      }
+
+      if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS)
+            || ($region == 1 && BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS)
+            || ($region == 2 && URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS))
+      {
+	# if (TOKENIZE_LONG_TOKENS_AS_SKIPS)
+	# Spambayes trick via Matt: Just retain 7 chars.  Do not retain
+	# the length, it does not help; see my mail to -devel of Nov 20 2002.
+	# "sk:" stands for "skip".
+	$token = "sk:".substr($token, 0, 7);
+      }
+    }
+
+    # decompose tokens?  do this after shortening long tokens
+    if ($region == 1 || $region == 2) {
+      if (DECOMPOSE_BODY_TOKENS) {
+        if ($token =~ /[^\w:\*]/) {
+          my $decompd = $token;                        # "Foo!"
+          $decompd =~ s/[^\w:\*]//gs;
+          push (@rettokens, $tokprefix.$decompd);      # "Foo"
+        }
+
+        if ($token =~ /[A-Z]/) {
+          my $decompd = $token; $decompd = lc $decompd;
+          push (@rettokens, $tokprefix.$decompd);      # "foo!"
+
+          if ($token =~ /[^\w:\*]/) {
+            $decompd =~ s/[^\w:\*]//gs;
+            push (@rettokens, $tokprefix.$decompd);    # "foo"
+          }
+        }
+      }
+    }
+
+    push (@rettokens, $tokprefix.$token);
+  }
+
+  return @rettokens;
+}
+
+sub _tokenize_headers {
+  my ($self, $msg) = @_;
+
+  my %parsed;
+
+  my %user_ignore;
+  $user_ignore{lc $_} = 1 for @{$self->{main}->{conf}->{bayes_ignore_headers}};
+
+  # get headers in array context
+  my @hdrs;
+  my @rcvdlines;
+  for ($msg->get_all_headers()) {
+    # first, keep a copy of Received headers, so we can strip down to last 2
+    if (/^Received:/i) {
+      push(@rcvdlines, $_);
+      next;
+    }
+    # and now skip lines for headers we don't want (including all Received)
+    next if /^${IGNORED_HDRS}:/i;
+    next if IGNORE_MSGID_TOKENS && /^Message-ID:/i;
+    push(@hdrs, $_);
+  }
+  push(@hdrs, $msg->get_all_metadata());
+
+  # and re-add the last 2 received lines: usually a good source of
+  # spamware tokens and HELO names.
+  if ($#rcvdlines >= 0) { push(@hdrs, $rcvdlines[$#rcvdlines]); }
+  if ($#rcvdlines >= 1) { push(@hdrs, $rcvdlines[$#rcvdlines-1]); }
+
+  for (@hdrs) {
+    next unless /\S/;
+    my ($hdr, $val) = split(/:/, $_, 2);
+
+    # remove user-specified headers here, after Received, in case they
+    # want to ignore that too
+    next if exists $user_ignore{lc $hdr};
+
+    # Prep the header value
+    $val ||= '';
+    chomp($val);
+
+    # special tokenization for some headers:
+    if ($hdr =~ /^(?:|X-|Resent-)Message-Id$/i) {
+      $val = $self->_pre_chew_message_id ($val);
+    }
+    elsif (PRE_CHEW_ADDR_HEADERS && $hdr =~ /^(?:|X-|Resent-)
+	(?:Return-Path|From|To|Cc|Reply-To|Errors-To|Mail-Followup-To|Sender)$/ix)
+    {
+      $val = $self->_pre_chew_addr_header ($val);
+    }
+    elsif ($hdr eq 'Received') {
+      $val = $self->_pre_chew_received ($val);
+    }
+    elsif ($hdr eq 'Content-Type') {
+      $val = $self->_pre_chew_content_type ($val);
+    }
+    elsif ($hdr eq 'MIME-Version') {
+      $val =~ s/1\.0//;		# totally innocuous
+    }
+    elsif ($hdr =~ /^${MARK_PRESENCE_ONLY_HDRS}$/i) {
+      $val = "1"; # just mark the presence, they create lots of hapaxen
+    }
+
+    if (MAP_HEADERS_MID) {
+      if ($hdr =~ /^(?:In-Reply-To|References|Message-ID)$/i) {
+        $parsed{"*MI"} = $val;
+      }
+    }
+    if (MAP_HEADERS_FROMTOCC) {
+      if ($hdr =~ /^(?:From|To|Cc)$/i) {
+        $parsed{"*Ad"} = $val;
+      }
+    }
+    if (MAP_HEADERS_USERAGENT) {
+      if ($hdr =~ /^(?:X-Mailer|User-Agent)$/i) {
+        $parsed{"*UA"} = $val;
+      }
+    }
+
+    # replace hdr name with "compressed" version if possible
+    if (defined $HEADER_NAME_COMPRESSION{$hdr}) {
+      $hdr = $HEADER_NAME_COMPRESSION{$hdr};
+    }
+
+    if (exists $parsed{$hdr}) {
+      $parsed{$hdr} .= " ".$val;
+    } else {
+      $parsed{$hdr} = $val;
+    }
+    if (would_log('dbg', 'bayes') > 1) {
+      dbg("bayes: header tokens for $hdr = \"$parsed{$hdr}\"");
+    }
+  }
+
+  return %parsed;
+}
+
+sub _pre_chew_content_type {
+  my ($self, $val) = @_;
+
+  # hopefully this will retain good bits without too many hapaxen
+  if ($val =~ s/boundary=[\"\'](.*?)[\"\']/ /ig) {
+    my $boundary = $1;
+    $boundary =~ s/[a-fA-F0-9]/H/gs;
+    # break up blocks of separator chars so they become their own tokens
+    $boundary =~ s/([-_\.=]+)/ $1 /gs;
+    $val .= $boundary;
+  }
+
+  # stop-list words for Content-Type header: these wind up totally gray
+  $val =~ s/\b(?:text|charset)\b//;
+
+  $val;
+}
+
+sub _pre_chew_message_id {
+  my ($self, $val) = @_;
+  # we can (a) get rid of a lot of hapaxen and (b) increase the token
+  # specificity by pre-parsing some common formats.
+
+  # Outlook Express format:
+  $val =~ s/<([0-9a-f]{4})[0-9a-f]{4}[0-9a-f]{4}\$
+           ([0-9a-f]{4})[0-9a-f]{4}\$
+           ([0-9a-f]{8})\@(\S+)>/ OEA$1 OEB$2 OEC$3 $4 /gx;
+
+  # Exim:
+  $val =~ s/<[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]\@//;
+
+  # Sendmail:
+  $val =~ s/<20\d\d[01]\d[0123]\d[012]\d[012345]\d[012345]\d\.
+           [A-F0-9]{10,12}\@//gx;
+
+  # try to split Message-ID segments on probable ID boundaries. Note that
+  # Outlook message-ids seem to contain a server identifier ID in the last
+  # 8 bytes before the @.  Make sure this becomes its own token, it's a
+  # great spam-sign for a learning system!  Be sure to split on ".".
+  $val =~ s/[^_A-Za-z0-9]/ /g;
+  $val;
+}
+
+sub _pre_chew_received {
+  my ($self, $val) = @_;
+
+  # Thanks to Dan for these.  Trim out "useless" tokens; sendmail-ish IDs
+  # and valid-format RFC-822/2822 dates
+
+  $val =~ s/\swith\sSMTP\sid\sg[\dA-Z]{10,12}\s/ /gs;  # Sendmail
+  $val =~ s/\swith\sESMTP\sid\s[\dA-F]{10,12}\s/ /gs;  # Sendmail
+  $val =~ s/\bid\s[a-zA-Z0-9]{7,20}\b/ /gs;    # Sendmail
+  $val =~ s/\bid\s[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]/ /gs; # exim
+
+  $val =~ s/(?:(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s)?
+           [0-3\s]?[0-9]\s
+           (?:Jan|Feb|Ma[ry]|Apr|Ju[nl]|Aug|Sep|Oct|Nov|Dec)\s
+           (?:19|20)?[0-9]{2}\s
+           [0-2][0-9](?:\:[0-5][0-9]){1,2}\s
+           (?:\s*\(|\)|\s*(?:[+-][0-9]{4})|\s*(?:UT|[A-Z]{2,3}T))*
+           //gx;
+
+  # IPs: break down to nearest /24, to reduce hapaxes -- EXCEPT for
+  # IPs in the 10 and 192.168 ranges, they gets lots of significant tokens
+  # (on both sides)
+  # also make a dup with the full IP, as fodder for
+  # bayes_dump_to_trusted_networks: "H*r:ip*aaa.bbb.ccc.ddd"
+  $val =~ s{\b(\d{1,3}\.)(\d{1,3}\.)(\d{1,3})(\.\d{1,3})\b}{
+           if ($2 eq '10' || ($2 eq '192' && $3 eq '168')) {
+             $1.$2.$3.$4.
+		" ip*".$1.$2.$3.$4." ";
+           } else {
+             $1.$2.$3.
+		" ip*".$1.$2.$3.$4." ";
+           }
+         }gex;
+
+  # trim these: they turn out as the most common tokens, but with a
+  # prob of about .5.  waste of space!
+  $val =~ s/\b(?:with|from|for|SMTP|ESMTP)\b/ /g;
+
+  $val;
+}
+
+sub _pre_chew_addr_header {
+  my ($self, $val) = @_;
+  local ($_);
+
+  my @addrs = $self->{main}->find_all_addrs_in_line ($val);
+  my @toks;
+  foreach (@addrs) {
+    push (@toks, $self->_tokenize_mail_addrs ($_));
+  }
+  return join (' ', @toks);
+}
+
+sub _tokenize_mail_addrs {
+  my ($self, $addr) = @_;
+
+  ($addr =~ /(.+)\@(.+)$/) or return ();
+  my @toks;
+  push(@toks, "U*".$1, "D*".$2);
+  $_ = $2; while (s/^[^\.]+\.(.+)$/$1/gs) { push(@toks, "D*".$1); }
+  return @toks;
+}
+
+
+###########################################################################
+
+# compute the probability that a token is spammish
+sub _compute_prob_for_token {
+  my ($self, $token, $ns, $nn, $s, $n) = @_;
+
+  # we allow the caller to give us the token information, just
+  # to save a potentially expensive lookup
+  if (!defined($s) || !defined($n)) {
+    ($s, $n, undef) = $self->{store}->tok_get ($token);
+  }
+
+  return if ($s == 0 && $n == 0);
+
+  if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
+    return if ($s + $n < 10);      # ignore low-freq tokens
+  }
+
+  if (!$self->{use_hapaxes}) {
+    return if ($s + $n < 2);
+  }
+
+  return if ( $ns == 0 || $nn == 0 );
+
+  my $ratios = ($s / $ns);
+  my $ration = ($n / $nn);
+
+  my $prob;
+
+  if ($ratios == 0 && $ration == 0) {
+    warn "bayes: oops? ratios == ration == 0";
+    return;
+  } else {
+    $prob = ($ratios) / ($ration + $ratios);
+  }
+
+  if (USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
+    # use Robinson's f(x) equation for low-n tokens, instead of just
+    # ignoring them
+    my $robn = $s+$n;
+    $prob = ($Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X + ($robn * $prob))
+                             /
+            ($Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT + $robn);
+  }
+
+  # 'log_raw_counts' is used to log the raw data for the Bayes equations during
+  # a mass-check, allowing the S and X constants to be optimized quickly
+  # without requiring re-tokenization of the messages for each attempt. There's
+  # really no need for this code to be uncommented in normal use, however.   It
+  # has never been publicly documented, so commenting it out is fine. ;)
+
+  ## if ($self->{log_raw_counts}) {
+  ## $self->{raw_counts} .= " s=$s,n=$n ";
+  ## }
+
+  return $prob;
+}
+
+###########################################################################
+# If a token is neither hammy nor spammy, return 0.
+# For a spammy token, return the minimum number of additional ham messages
+# it would have had to appear in to no longer be spammy.  Hammy tokens
+# are handled similarly.  That's what the function does (at the time
+# of this writing, 31 July 2003, 16:02:55 CDT).  It would be slightly
+# more useful if it returned the number of /additional/ ham messages
+# a spammy token would have to appear in to no longer be spammy but I
+# fear that might require the solution to a cubic equation, and I
+# just don't have the time for that now.
+
+sub _compute_declassification_distance {
+  my ($self, $Ns, $Nn, $ns, $nn, $prob) = @_;
+
+  return 0 if $ns == 0 && $nn == 0;
+
+  if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {return 0 if ($ns + $nn < 10);}
+  if (!$self->{use_hapaxes}) {return 0 if ($ns + $nn < 2);}
+
+  return 0 if $Ns == 0 || $Nn == 0;
+  return 0 if abs( $prob - 0.5 ) <
+                $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
+
+  my ($Na,$na,$Nb,$nb) = $prob > 0.5 ? ($Nn,$nn,$Ns,$ns) : ($Ns,$ns,$Nn,$nn);
+  my $p = 0.5 - $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
+
+  return int( 1.0 - 1e-6 + $nb * $Na * $p / ($Nb * ( 1 - $p )) ) - $na
+    unless USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS;
+
+  my $s = $Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT;
+  my $sx = $Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X;
+  my $a = $Nb * ( 1 - $p );
+  my $b = $Nb * ( $sx + $nb * ( 1 - $p ) - $p * $s ) - $p * $Na * $nb;
+  my $c = $Na * $nb * ( $sx - $p * ( $s + $nb ) );
+  my $discrim = $b * $b - 4 * $a * $c;
+  my $disc_max_0 = $discrim < 0 ? 0 : $discrim;
+  my $dd_exact = ( 1.0 - 1e-6 + ( -$b + sqrt( $disc_max_0 ) ) / ( 2*$a ) ) - $na;
+
+  # This shouldn't be necessary.  Should not be < 1
+  return $dd_exact < 1 ? 1 : int($dd_exact);
+}
+
+###########################################################################
+
+sub _opportunistic_calls {
+  my($self, $journal_only) = @_;
+
+  # If we're not already tied, abort.
+  if (!$self->{store}->db_readable()) {
+    dbg("bayes: opportunistic call attempt failed, DB not readable");
+    return;
+  }
+
+  # Is an expire or sync running?
+  my $running_expire = $self->{store}->get_running_expire_tok();
+  if ( defined $running_expire && $running_expire+$OPPORTUNISTIC_LOCK_VALID > time() ) {
+    dbg("bayes: opportunistic call attempt skipped, found fresh running expire magic token");
+    return;
+  }
+
+  # handle expiry and syncing
+  if (!$journal_only && $self->{store}->expiry_due()) {
+    dbg("bayes: opportunistic call found expiry due");
+
+    # sync will bring the DB R/W as necessary, and the expire will remove
+    # the running_expire token, may untie as well.
+    $self->{main}->{bayes_scanner}->sync(1,1);
+  }
+  elsif ( $self->{store}->sync_due() ) {
+    dbg("bayes: opportunistic call found journal sync due");
+
+    # sync will bring the DB R/W as necessary, may untie as well
+    $self->{main}->{bayes_scanner}->sync(1,0);
+
+    # We can only remove the running_expire token if we're doing R/W
+    if ($self->{store}->db_writable()) {
+      $self->{store}->remove_running_expire_tok();
+    }
+  }
+
+  return;
+}
+
+###########################################################################
+
+sub learner_new {
+  my ($self) = @_;
+
+  if ($self->{conf}->{bayes_store_module}) {
+    my $module = $self->{conf}->{bayes_store_module};
+    $module = untaint_var($module);  # good enough?
+    my $store;
+
+    eval '
+      require '.$module.';
+      $store = '.$module.'->new($self);
+      1;
+    ' or do {
+      my $eval_stat = $@ ne '' ? $@ : "errno=$!";  chomp $eval_stat;
+      die "bayes: (in new) $eval_stat\n";
+    };
+    $self->{store} = $store;
+  }
+  else {
+    require Mail::SpamAssassin::BayesStore::DBM;
+    $self->{store} = Mail::SpamAssassin::BayesStore::DBM->new($self);
+  }
+
+  $self;
+}
+
+###########################################################################
+
+sub bayes_report_make_list {
+  my ($self, $pms, $info, $param) = @_;
+  return "Tokens not available." unless defined $info;
+
+  my ($limit,$fmt_arg,$more) = split /,/, ($param || '5');
+
+  my %formats = (
+      short => '$t',
+      Short => 'Token: \"$t\"',
+      compact => '$p-$D--$t',
+      Compact => 'Probability $p -declassification distance $D (\"+\" means > 9) --token: \"$t\"',
+      medium => '$p-$D-$N--$t',
+      long => '$p-$d--${h}h-${s}s--${a}d--$t',
+      Long => 'Probability $p -declassification distance $D --in ${h} ham messages -and ${s} spam messages --${a} days old--token:\"$t\"'
+    );
+
+  my $raw_fmt = (!$fmt_arg ? '$p-$D--$t' : $formats{$fmt_arg});
+
+  return "Invalid format, must be one of: ".join(",",keys %formats)
+    unless defined $raw_fmt;
+
+  my $fmt = '"'.$raw_fmt.'"';
+  my $amt = $limit < @$info ? $limit : @$info;
+  return "" unless $amt;
+
+  my $ns = $pms->{bayes_nspam};
+  my $nh = $pms->{bayes_nham};
+  my $digit = sub { $_[0] > 9 ? "+" : $_[0] };
+  my $now = time;
+
+  join ', ', map {
+    my($t,$prob,$s,$h,$u) = @$_;
+    my $a = int(($now - $u)/(3600 * 24));
+    my $d = $self->_compute_declassification_distance($ns,$nh,$s,$h,$prob);
+    my $p = sprintf "%.3f", $prob;
+    my $n = $s + $h;
+    my ($c,$o) = $prob < 0.5 ? ($h,$s) : ($s,$h);
+    my ($D,$S,$H,$C,$O,$N) = map &$digit($_), ($d,$s,$h,$c,$o,$n);
+    eval $fmt;  ## no critic
+  } @{$info}[0..$amt-1];
+}
+
 1;
+
+=back
+
+=cut
Index: lib/Mail/SpamAssassin/Conf.pm
===================================================================
--- lib/Mail/SpamAssassin/Conf.pm	(.../trunk)	(revision 602913)
+++ lib/Mail/SpamAssassin/Conf.pm	(.../branches/bug-5293-pluginized-bayes)	(revision 602913)
@@ -1278,6 +1278,20 @@
 
 =over 4
 
+=item use_learner ( 0 | 1 )		(default: 1)
+
+Whether to use any machine-learning classifiers with SpamAssassin, such as the
+default 'BAYES_*' rules.  Setting this to 0 will disable use of any and all
+human-trained classifiers.
+
+=cut
+
+  push (@cmds, {
+    setting => 'use_learner',
+    default => 1,
+    type => $CONF_TYPE_BOOL
+  });
+
 =item use_bayes ( 0 | 1 )		(default: 1)
 
 Whether to use the naive-Bayesian-style classifier built into
@@ -2629,6 +2643,77 @@
     }
   });
 
+##OSBF##=item osbf_path /path/filename	(default: ~/.spamassassin/osbf)
+##OSBF##
+##OSBF##This is the directory and filename for Bayes databases.  Several databases
+##OSBF##will be created, with this as the base directory and filename, with C<_toks>,
+##OSBF##C<_seen>, etc. appended to the base.  The default setting results in files
+##OSBF##called C<~/.spamassassin/osbf_seen>, C<~/.spamassassin/osbf_toks>, etc.
+##OSBF##
+##OSBF##By default, each user has their own in their C<~/.spamassassin> directory with
+##OSBF##mode 0700/0600.  For system-wide SpamAssassin use, you may want to reduce disk
+##OSBF##space usage by sharing this across all users.  However, Bayes appears to be
+##OSBF##more effective with individual user databases.
+##OSBF##
+##OSBF##=cut
+##OSBF##
+##OSBF##  push (@cmds, {
+##OSBF##    setting => 'osbf_path',
+##OSBF##    is_admin => 1,
+##OSBF##    default => '__userstate__/osbf',
+##OSBF##    code => sub {
+##OSBF##      my ($self, $key, $value, $line) = @_;
+##OSBF##      unless (defined $value && $value !~ /^$/) {
+##OSBF##	return $MISSING_REQUIRED_VALUE;
+##OSBF##      }
+##OSBF##      if (-d $value) {
+##OSBF##	return $INVALID_VALUE;
+##OSBF##      }
+##OSBF##     $self->{osbf_path} = $value;
+##OSBF##    }
+##OSBF##  });
+##OSBF##
+##OSBF##=item osbf_file_mode		(default: 0700)
+##OSBF##
+##OSBF##The file mode bits used for the Bayesian filtering database files.
+##OSBF##
+##OSBF##Make sure you specify this using the 'x' mode bits set, as it may also be used
+##OSBF##to create directories.  However, if a file is created, the resulting file will
+##OSBF##not have any execute bits set (the umask is set to 111). The argument is a
+##OSBF##string of octal digits, it is converted to a numeric value internally.
+##OSBF##
+##OSBF##=cut
+##OSBF##
+##OSBF##  push (@cmds, {
+##OSBF##    setting => 'osbf_file_mode',
+##OSBF##    is_admin => 1,
+##OSBF##    default => '0700',
+##OSBF##    type => $CONF_TYPE_NUMERIC
+##OSBF##  });
+##OSBF##
+##OSBF##=item osbf_store_module Name::Of::BayesStore::Module
+##OSBF##
+##OSBF##If this option is set, the module given will be used as an alternate
+##OSBF##to the default osbf storage mechanism.  It must conform to the
+##OSBF##published storage specification (see
+##OSBF##Mail::SpamAssassin::BayesStore). For example, set this to
+##OSBF##Mail::SpamAssassin::BayesStore::SQL to use the generic SQL storage
+##OSBF##module.
+##OSBF##
+##OSBF##=cut
+##OSBF##
+##OSBF##  push (@cmds, {
+##OSBF##    setting => 'osbf_store_module',
+##OSBF##    is_admin => 1,
+##OSBF##    default => '',
+##OSBF##    code => sub {
+##OSBF##      my ($self, $key, $value, $line) = @_;
+##OSBF##      local ($1);
+##OSBF##      if ($value !~ /^([_A-Za-z0-9:]+)$/) { return $INVALID_VALUE; }
+##OSBF##      $self->{osbf_store_module} = $1;
+##OSBF##    }
+##OSBF##  });
+
 =item bayes_sql_dsn DBI::databasetype:databasename:hostname:port
 
 Used for BayesStore::SQL storage implementation.
Index: lib/Mail/SpamAssassin/Bayes.pm
===================================================================
--- lib/Mail/SpamAssassin/Bayes.pm	(.../trunk)	(revision 602913)
+++ lib/Mail/SpamAssassin/Bayes.pm	(.../branches/bug-5293-pluginized-bayes)	(revision 602913)
@@ -17,31 +17,17 @@
 
 =head1 NAME
 
-Mail::SpamAssassin::Bayes - determine spammishness using a Bayesian classifier
+Mail::SpamAssassin::Bayes - support for learning classifiers
 
 =head1 DESCRIPTION
 
-This is a Bayesian-like form of probability-analysis classification, using an
-algorithm based on the one detailed in Paul Graham's I<A Plan For Spam> paper
-at:
+This is the general class used to train a learning classifier with new samples
+of spam and ham mail, and classify based on prior training. 
 
-  http://www.paulgraham.com/spam.html
+Prior to version 3.3.0, the default Bayes implementation was here; if you're
+looking for information on that, it has moved to
+C<Mail::SpamAssassin::Plugin::Bayes>.
 
-It also incorporates some other aspects taken from Graham Robinson's webpage
-on the subject at:
-
-  http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
-
-And the chi-square probability combiner as described here:
-
-  http://www.linuxjournal.com/print.php?sid=6467
-
-The results are incorporated into SpamAssassin as the BAYES_* rules.
-
-=head1 METHODS
-
-=over 4
-
 =cut
 
 package Mail::SpamAssassin::Bayes;
@@ -56,172 +42,8 @@
 use Mail::SpamAssassin::Logger;
 use Mail::SpamAssassin::Util qw(untaint_var);
 
-# pick ONLY ONE of these combining implementations.
-use Mail::SpamAssassin::Bayes::CombineChi;
-# use Mail::SpamAssassin::Bayes::CombineNaiveBayes;
+our @ISA = qw();
 
-use Digest::SHA1 qw(sha1 sha1_hex);
-
-use vars qw{
-  @ISA
-  $IGNORED_HDRS
-  $MARK_PRESENCE_ONLY_HDRS
-  %HEADER_NAME_COMPRESSION
-  $OPPORTUNISTIC_LOCK_VALID
-};
-
-@ISA = qw();
-
-# Which headers should we scan for tokens?  Don't use all of them, as it's easy
-# to pick up spurious clues from some.  What we now do is use all of them
-# *less* these well-known headers; that way we can pick up spammers' tracking
-# headers (which are obviously not well-known in advance!).
-
-# Received is handled specially
-$IGNORED_HDRS = qr{(?: (?:X-)?Sender    # misc noise
-  |Delivered-To |Delivery-Date
-  |(?:X-)?Envelope-To
-  |X-MIME-Auto[Cc]onverted |X-Converted-To-Plain-Text
-
-  |Subject      # not worth a tiny gain vs. to db size increase
-
-  # Date: can provide invalid cues if your spam corpus is
-  # older/newer than ham
-  |Date
-
-  # List headers: ignore. a spamfiltering mailing list will
-  # become a nonspam sign.
-  |X-List|(?:X-)?Mailing-List
-  |(?:X-)?List-(?:Archive|Help|Id|Owner|Post|Subscribe
-    |Unsubscribe|Host|Id|Manager|Admin|Comment
-    |Name|Url)
-  |X-Unsub(?:scribe)?
-  |X-Mailman-Version |X-Been[Tt]here |X-Loop
-  |Mail-Followup-To
-  |X-eGroups-(?:Return|From)
-  |X-MDMailing-List
-  |X-XEmacs-List
-
-  # gatewayed through mailing list (thanks to Allen Smith)
-  |(?:X-)?Resent-(?:From|To|Date)
-  |(?:X-)?Original-(?:From|To|Date)
-
-  # Spamfilter/virus-scanner headers: too easy to chain from
-  # these
-  |X-MailScanner(?:-SpamCheck)?
-  |X-Spam(?:-(?:Status|Level|Flag|Report|Hits|Score|Checker-Version))?
-  |X-Antispam |X-RBL-Warning |X-Mailscanner
-  |X-MDaemon-Deliver-To |X-Virus-Scanned
-  |X-Mass-Check-Id
-  |X-Pyzor |X-DCC-\S{2,25}-Metrics
-  |X-Filtered-B[Yy] |X-Scanned-By |X-Scanner
-  |X-AP-Spam-(?:Score|Status) |X-RIPE-Spam-Status
-  |X-SpamCop-[^:]+
-  |X-SMTPD |(?:X-)?Spam-Apparently-To
-  |SPAM |X-Perlmx-Spam
-  |X-Bogosity
-
-  # some noisy Outlook headers that add no good clues:
-  |Content-Class |Thread-(?:Index|Topic)
-  |X-Original[Aa]rrival[Tt]ime
-
-  # Annotations from IMAP, POP, and MH:
-  |(?:X-)?Status |X-Flags |Replied |Forwarded
-  |Lines |Content-Length
-  |X-UIDL? |X-IMAPbase
-
-  # Annotations from Bugzilla
-  |X-Bugzilla-[^:]+
-
-  # Annotations from VM: (thanks to Allen Smith)
-  |X-VM-(?:Bookmark|(?:POP|IMAP)-Retrieved|Labels|Last-Modified
-    |Summary-Format|VHeader|v\d-Data|Message-Order)
-
-  # Annotations from Gnus:
-  | X-Gnus-Mail-Source
-  | Xref
-
-)}x;
-
-# Note only the presence of these headers, in order to reduce the
-# hapaxen they generate.
-$MARK_PRESENCE_ONLY_HDRS = qr{(?: X-Face
-  |X-(?:Gnu-?PG|PGP|GPG)(?:-Key)?-Fingerprint
-)}ix;
-
-# tweaks tested as of Nov 18 2002 by jm: see SpamAssassin-devel list archives
-# for results.  The winners are now the default settings.
-use constant IGNORE_TITLE_CASE => 1;
-use constant TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES => 1;
-use constant TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
-
-# tweaks of May 12 2003, see SpamAssassin-devel archives again.
-use constant PRE_CHEW_ADDR_HEADERS => 1;
-use constant CHEW_BODY_URIS => 1;
-use constant CHEW_BODY_MAILADDRS => 1;
-use constant HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
-use constant BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
-use constant URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 0;
-use constant IGNORE_MSGID_TOKENS => 0;
-
-# tweaks of 12 March 2004, see bug 2129.
-use constant DECOMPOSE_BODY_TOKENS => 1;
-use constant MAP_HEADERS_MID => 1;
-use constant MAP_HEADERS_FROMTOCC => 1;
-use constant MAP_HEADERS_USERAGENT => 1;
-
-# tweaks, see http://issues.apache.org/SpamAssassin/show_bug.cgi?id=3173#c26
-use constant ADD_INVIZ_TOKENS_I_PREFIX => 1;
-use constant ADD_INVIZ_TOKENS_NO_PREFIX => 0;
-
-# We store header-mined tokens in the db with a "HHeaderName:val" format.
-# some headers may contain lots of gibberish tokens, so allow a little basic
-# compression by mapping the header name at least here.  these are the headers
-# which appear with the most frequency in my db.  note: this doesn't have to
-# be 2-way (ie. LHSes that map to the same RHS are not a problem), but mixing
-# tokens from multiple different headers may impact accuracy, so might as well
-# avoid this if possible. These are the top ones from my corpus, BTW (jm).
-%HEADER_NAME_COMPRESSION = (
-  'Message-Id'		=> '*m',
-  'Message-ID'		=> '*M',
-  'Received'		=> '*r',
-  'User-Agent'		=> '*u',
-  'References'		=> '*f',
-  'In-Reply-To'		=> '*i',
-  'From'		=> '*F',
-  'Reply-To'		=> '*R',
-  'Return-Path'		=> '*p',
-  'Return-path'		=> '*rp',
-  'X-Mailer'		=> '*x',
-  'X-Authentication-Warning' => '*a',
-  'Organization'	=> '*o',
-  'Organisation'        => '*o',
-  'Content-Type'	=> '*c',
-  'X-Spam-Relays-Trusted' => '*RT',
-  'X-Spam-Relays-Untrusted' => '*RU',
-);
-
-# How many seconds should the opportunistic_expire lock be valid?
-$OPPORTUNISTIC_LOCK_VALID = 300;
-
-# Should we use the Robinson f(w) equation from
-# http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html ?
-# It gives better results, in that scores are more likely to distribute
-# into the <0.5 range for nonspam and >0.5 for spam.
-use constant USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS => 1;
-
-# How many of the most significant tokens should we use for the p(w)
-# calculation?
-use constant N_SIGNIFICANT_TOKENS => 150;
-
-# How many significant tokens are required for a classifier score to
-# be considered usable?
-use constant REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE => -1;
-
-# How long a token should we hold onto?  (note: German speakers typically
-# will require a longer token than English ones.)
-use constant MAX_TOKEN_LENGTH => 15;
-
 ###########################################################################
 
 sub new {
@@ -232,450 +54,49 @@
   my $self = {
     'main'              => $main,
     'conf'		=> $main->{conf},
-    ## 'log_raw_counts'	=> 0, # see compute_prob_for_token()
     'use_ignores'       => 1,
   };
   bless ($self, $class);
 
-  if ($self->{conf}->{bayes_store_module}) {
-    my $module = $self->{conf}->{bayes_store_module};
-    $module = untaint_var($module);  # good enough?
-    my $store;
-
-    eval '
-      require '.$module.';
-      $store = '.$module.'->new($self);
-      1;
-    ' or do {
-      my $eval_stat = $@ ne '' ? $@ : "errno=$!";  chomp $eval_stat;
-      die "bayes: (in new) $eval_stat\n";
-    };
-    $self->{store} = $store;
-  }
-  else {
-    require Mail::SpamAssassin::BayesStore::DBM;
-    $self->{store} = Mail::SpamAssassin::BayesStore::DBM->new($self);
-  }
-
+  $self->{main}->call_plugins("learner_new");
   $self;
 }
 
+###########################################################################
+
 sub finish {
   my $self = shift;
-  #if (!$self->{conf}->{use_bayes}) { return; }
-
-  # if we're untying too much, uncomment this...
-  # use Carp qw(cluck); cluck "stack trace at untie";
-
-  $self->{store}->untie_db();
+  # we don't need to do the plugin; Mail::SpamAssassin::finish() does
+  # that for us
   %{$self} = ();
 }
 
-sub sa_die { Mail::SpamAssassin::sa_die(@_); }
-
 ###########################################################################
 
-sub sanity_check_is_untied {
+# force the Bayes dbs to be closed, if they haven't already been; called
+# at the end of scan operation, or when switching between user IDs,
+# or when C<Mail::SpamAssassin::finish_learner()> is called.
+#
+sub force_close {
   my $self = shift;
   my $quiet = shift;
-
-  # do a sanity check here.  Wierd things happen if we remain tied
-  # after compiling; for example, spamd will never see that the
-  # number of messages has reached the bayes-scanning threshold.
-  if ($self->{store}->db_readable()) {
-    warn "bayes: oops! still tied to bayes DBs, untying\n" unless $quiet;
-    $self->{store}->untie_db();
-  }
+  $self->{main}->call_plugins("learner_close", { quiet => $quiet });
 }
 
 ###########################################################################
 
-# read configuration items to control bayes behaviour.  Called by
-# BayesStore::read_db_configs().
-sub read_db_configs {
-  my ($self) = @_;
-
-  # use of hapaxes.  Set on bayes object, since it controls prob
-  # computation.
-  $self->{use_hapaxes} = $self->{conf}->{bayes_use_hapaxes};
-}
-
-###########################################################################
-
-# The calling functions expect a uniq'ed array of tokens ...
-sub tokenize {
-  my ($self, $msg, $msgdata) = @_;
-
-  # the body
-  my @tokens = map { $self->tokenize_line ($_, '', 1) }
-                                    @{$msgdata->{bayes_token_body}};
-
-  # the URI list
-  push (@tokens, map { $self->tokenize_line ($_, '', 2) }
-                                    @{$msgdata->{bayes_token_uris}});
-
-  # add invisible tokens
-  if (ADD_INVIZ_TOKENS_I_PREFIX) {
-    push (@tokens, map { $self->tokenize_line ($_, "I*:", 1) }
-                                    @{$msgdata->{bayes_token_inviz}});
-  }
-  if (ADD_INVIZ_TOKENS_NO_PREFIX) {
-    push (@tokens, map { $self->tokenize_line ($_, "", 1) }
-                                    @{$msgdata->{bayes_token_inviz}});
-  }
-
-  # Tokenize the headers
-  my %hdrs = $self->tokenize_headers ($msg);
-  while( my($prefix, $value) = each %hdrs ) {
-    push(@tokens, $self->tokenize_line ($value, "H$prefix:", 0));
-  }
-
-  # Go ahead and uniq the array, skip null tokens (can happen sometimes)
-  # generate an SHA1 hash and take the lower 40 bits as our token
-  my %tokens;
-  foreach my $token (@tokens) {
-    next unless length($token); # skip 0 length tokens
-    $tokens{substr(sha1($token), -5)} = $token;
-  }
-
-  # return the keys == tokens ...
-  return \%tokens;
-}
-
-sub tokenize_line {
-  my $self = $_[0];
-  my $tokprefix = $_[2];
-  my $region = $_[3];
-  local ($_) = $_[1];
-
-  my @rettokens;
-
-  # include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings,
-  # and ISO-8859-15 alphas.  Do not split on @'s; better results keeping it.
-  # Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!"
-  tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs;
-
-  # DO split on "..." or "--" or "---"; common formatting error resulting in
-  # hapaxes.  Keep the separator itself as a token, though, as long ones can
-  # be good spamsigns.
-  s/(\w)(\.{3,6})(\w)/$1 $2 $3/gs;
-  s/(\w)(\-{2,6})(\w)/$1 $2 $3/gs;
-
-  if (IGNORE_TITLE_CASE) {
-    if ($region == 1 || $region == 2) {
-      # lower-case Title Case at start of a full-stop-delimited line (as would
-      # be seen in a Western language).
-      s/(?:^|\.\s+)([A-Z])([^A-Z]+)(?:\s|$)/ ' '. (lc $1) . $2 . ' ' /ge;
-    }
-  }
-
-  my $magic_re = $self->{store}->get_magic_re();
-
-  foreach my $token (split) {
-    $token =~ s/^[-'"\.,]+//;        # trim non-alphanum chars at start or end
-    $token =~ s/[-'"\.,]+$//;        # so we don't get loads of '"foo' tokens
-
-    # Skip false magic tokens
-    # TVD: we need to do a defined() check since SQL doesn't have magic
-    # tokens, so the SQL BayesStore returns undef.  I really want a way
-    # of optimizing that out, but I haven't come up with anything yet.
-    #
-    next if ( defined $magic_re && $token =~ /$magic_re/ );
-
-    # *do* keep 3-byte tokens; there's some solid signs in there
-    my $len = length($token);
-
-    # but extend the stop-list. These are squarely in the gray
-    # area, and it just slows us down to record them.
-    # See http://wiki.apache.org/spamassassin/BayesStopList for more info.
-    #
-    next if $len < 3 ||
-	($token =~ /^(?:a(?:ble|l(?:ready|l)|n[dy]|re)|b(?:ecause|oth)|c(?:an|ome)|e(?:ach|mail|ven)|f(?:ew|irst|or|rom)|give|h(?:a(?:ve|s)|ttp)|i(?:n(?:formation|to)|t\'s)|just|know|l(?:ike|o(?:ng|ok))|m(?:a(?:de|il(?:(?:ing|to))?|ke|ny)|o(?:re|st)|uch)|n(?:eed|o[tw]|umber)|o(?:ff|n(?:ly|e)|ut|wn)|p(?:eople|lace)|right|s(?:ame|ee|uch)|t(?:h(?:at|is|rough|e)|ime)|using|w(?:eb|h(?:ere|y)|ith(?:out)?|or(?:ld|k))|y(?:ears?|ou(?:(?:\'re|r))?))$/i);
-
-    # are we in the body?  If so, apply some body-specific breakouts
-    if ($region == 1 || $region == 2) {
-      if (CHEW_BODY_MAILADDRS && $token =~ /\S\@\S/i) {
-	push (@rettokens, $self->tokenize_mail_addrs ($token));
-      }
-      elsif (CHEW_BODY_URIS && $token =~ /\S\.[a-z]/i) {
-	push (@rettokens, "UD:".$token); # the full token
-	my $bit = $token; while ($bit =~ s/^[^\.]+\.(.+)$/$1/gs) {
-	  push (@rettokens, "UD:".$1); # UD = URL domain
-	}
-      }
-    }
-
-    # note: do not trim down overlong tokens if they contain '*'.  This is
-    # used as part of split tokens such as "HTo:D*net" indicating that 
-    # the domain ".net" appeared in the To header.
-    #
-    if ($len > MAX_TOKEN_LENGTH && $token !~ /\*/) {
-      if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {
-	# Matt sez: "Could be asian? Autrijus suggested doing character ngrams,
-	# but I'm doing tuples to keep the dbs small(er)."  Sounds like a plan
-	# to me! (jm)
-	while ($token =~ s/^(..?)//) {
-	  push (@rettokens, "8:$1");
-	}
-	next;
-      }
-
-      if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS)
-            || ($region == 1 && BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS)
-            || ($region == 2 && URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS))
-      {
-	# if (TOKENIZE_LONG_TOKENS_AS_SKIPS)
-	# Spambayes trick via Matt: Just retain 7 chars.  Do not retain
-	# the length, it does not help; see my mail to -devel of Nov 20 2002.
-	# "sk:" stands for "skip".
-	$token = "sk:".substr($token, 0, 7);
-      }
-    }
-
-    # decompose tokens?  do this after shortening long tokens
-    if ($region == 1 || $region == 2) {
-      if (DECOMPOSE_BODY_TOKENS) {
-        if ($token =~ /[^\w:\*]/) {
-          my $decompd = $token;                        # "Foo!"
-          $decompd =~ s/[^\w:\*]//gs;
-          push (@rettokens, $tokprefix.$decompd);      # "Foo"
-        }
-
-        if ($token =~ /[A-Z]/) {
-          my $decompd = $token; $decompd = lc $decompd;
-          push (@rettokens, $tokprefix.$decompd);      # "foo!"
-
-          if ($token =~ /[^\w:\*]/) {
-            $decompd =~ s/[^\w:\*]//gs;
-            push (@rettokens, $tokprefix.$decompd);    # "foo"
-          }
-        }
-      }
-    }
-
-    push (@rettokens, $tokprefix.$token);
-  }
-
-  return @rettokens;
-}
-
-sub tokenize_headers {
-  my ($self, $msg) = @_;
-
-  my %parsed;
-
-  my %user_ignore;
-  $user_ignore{lc $_} = 1 for @{$self->{main}->{conf}->{bayes_ignore_headers}};
-
-  # get headers in array context
-  my @hdrs;
-  my @rcvdlines;
-  for ($msg->get_all_headers()) {
-    # first, keep a copy of Received headers, so we can strip down to last 2
-    if (/^Received:/i) {
-      push(@rcvdlines, $_);
-      next;
-    }
-    # and now skip lines for headers we don't want (including all Received)
-    next if /^${IGNORED_HDRS}:/i;
-    next if IGNORE_MSGID_TOKENS && /^Message-ID:/i;
-    push(@hdrs, $_);
-  }
-  push(@hdrs, $msg->get_all_metadata());
-
-  # and re-add the last 2 received lines: usually a good source of
-  # spamware tokens and HELO names.
-  if ($#rcvdlines >= 0) { push(@hdrs, $rcvdlines[$#rcvdlines]); }
-  if ($#rcvdlines >= 1) { push(@hdrs, $rcvdlines[$#rcvdlines-1]); }
-
-  for (@hdrs) {
-    next unless /\S/;
-    my ($hdr, $val) = split(/:/, $_, 2);
-
-    # remove user-specified headers here, after Received, in case they
-    # want to ignore that too
-    next if exists $user_ignore{lc $hdr};
-
-    # Prep the header value
-    $val ||= '';
-    chomp($val);
-
-    # special tokenization for some headers:
-    if ($hdr =~ /^(?:|X-|Resent-)Message-Id$/i) {
-      $val = $self->pre_chew_message_id ($val);
-    }
-    elsif (PRE_CHEW_ADDR_HEADERS && $hdr =~ /^(?:|X-|Resent-)
-	(?:Return-Path|From|To|Cc|Reply-To|Errors-To|Mail-Followup-To|Sender)$/ix)
-    {
-      $val = $self->pre_chew_addr_header ($val);
-    }
-    elsif ($hdr eq 'Received') {
-      $val = $self->pre_chew_received ($val);
-    }
-    elsif ($hdr eq 'Content-Type') {
-      $val = $self->pre_chew_content_type ($val);
-    }
-    elsif ($hdr eq 'MIME-Version') {
-      $val =~ s/1\.0//;		# totally innocuous
-    }
-    elsif ($hdr =~ /^${MARK_PRESENCE_ONLY_HDRS}$/i) {
-      $val = "1"; # just mark the presence, they create lots of hapaxen
-    }
-
-    if (MAP_HEADERS_MID) {
-      if ($hdr =~ /^(?:In-Reply-To|References|Message-ID)$/i) {
-        $parsed{"*MI"} = $val;
-      }
-    }
-    if (MAP_HEADERS_FROMTOCC) {
-      if ($hdr =~ /^(?:From|To|Cc)$/i) {
-        $parsed{"*Ad"} = $val;
-      }
-    }
-    if (MAP_HEADERS_USERAGENT) {
-      if ($hdr =~ /^(?:X-Mailer|User-Agent)$/i) {
-        $parsed{"*UA"} = $val;
-      }
-    }
-
-    # replace hdr name with "compressed" version if possible
-    if (defined $HEADER_NAME_COMPRESSION{$hdr}) {
-      $hdr = $HEADER_NAME_COMPRESSION{$hdr};
-    }
-
-    if (exists $parsed{$hdr}) {
-      $parsed{$hdr} .= " ".$val;
-    } else {
-      $parsed{$hdr} = $val;
-    }
-    if (would_log('dbg', 'bayes') > 1) {
-      dbg("bayes: header tokens for $hdr = \"$parsed{$hdr}\"");
-    }
-  }
-
-  return %parsed;
-}
-
-sub pre_chew_content_type {
-  my ($self, $val) = @_;
-
-  # hopefully this will retain good bits without too many hapaxen
-  if ($val =~ s/boundary=[\"\'](.*?)[\"\']/ /ig) {
-    my $boundary = $1;
-    $boundary =~ s/[a-fA-F0-9]/H/gs;
-    # break up blocks of separator chars so they become their own tokens
-    $boundary =~ s/([-_\.=]+)/ $1 /gs;
-    $val .= $boundary;
-  }
-
-  # stop-list words for Content-Type header: these wind up totally gray
-  $val =~ s/\b(?:text|charset)\b//;
-
-  $val;
-}
-
-sub pre_chew_message_id {
-  my ($self, $val) = @_;
-  # we can (a) get rid of a lot of hapaxen and (b) increase the token
-  # specificity by pre-parsing some common formats.
-
-  # Outlook Express format:
-  $val =~ s/<([0-9a-f]{4})[0-9a-f]{4}[0-9a-f]{4}\$
-           ([0-9a-f]{4})[0-9a-f]{4}\$
-           ([0-9a-f]{8})\@(\S+)>/ OEA$1 OEB$2 OEC$3 $4 /gx;
-
-  # Exim:
-  $val =~ s/<[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]\@//;
-
-  # Sendmail:
-  $val =~ s/<20\d\d[01]\d[0123]\d[012]\d[012345]\d[012345]\d\.
-           [A-F0-9]{10,12}\@//gx;
-
-  # try to split Message-ID segments on probable ID boundaries. Note that
-  # Outlook message-ids seem to contain a server identifier ID in the last
-  # 8 bytes before the @.  Make sure this becomes its own token, it's a
-  # great spam-sign for a learning system!  Be sure to split on ".".
-  $val =~ s/[^_A-Za-z0-9]/ /g;
-  $val;
-}
-
-sub pre_chew_received {
-  my ($self, $val) = @_;
-
-  # Thanks to Dan for these.  Trim out "useless" tokens; sendmail-ish IDs
-  # and valid-format RFC-822/2822 dates
-
-  $val =~ s/\swith\sSMTP\sid\sg[\dA-Z]{10,12}\s/ /gs;  # Sendmail
-  $val =~ s/\swith\sESMTP\sid\s[\dA-F]{10,12}\s/ /gs;  # Sendmail
-  $val =~ s/\bid\s[a-zA-Z0-9]{7,20}\b/ /gs;    # Sendmail
-  $val =~ s/\bid\s[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]/ /gs; # exim
-
-  $val =~ s/(?:(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s)?
-           [0-3\s]?[0-9]\s
-           (?:Jan|Feb|Ma[ry]|Apr|Ju[nl]|Aug|Sep|Oct|Nov|Dec)\s
-           (?:19|20)?[0-9]{2}\s
-           [0-2][0-9](?:\:[0-5][0-9]){1,2}\s
-           (?:\s*\(|\)|\s*(?:[+-][0-9]{4})|\s*(?:UT|[A-Z]{2,3}T))*
-           //gx;
-
-  # IPs: break down to nearest /24, to reduce hapaxes -- EXCEPT for
-  # IPs in the 10 and 192.168 ranges, they gets lots of significant tokens
-  # (on both sides)
-  # also make a dup with the full IP, as fodder for
-  # bayes_dump_to_trusted_networks: "H*r:ip*aaa.bbb.ccc.ddd"
-  $val =~ s{\b(\d{1,3}\.)(\d{1,3}\.)(\d{1,3})(\.\d{1,3})\b}{
-           if ($2 eq '10' || ($2 eq '192' && $3 eq '168')) {
-             $1.$2.$3.$4.
-		" ip*".$1.$2.$3.$4." ";
-           } else {
-             $1.$2.$3.
-		" ip*".$1.$2.$3.$4." ";
-           }
-         }gex;
-
-  # trim these: they turn out as the most common tokens, but with a
-  # prob of about .5.  waste of space!
-  $val =~ s/\b(?:with|from|for|SMTP|ESMTP)\b/ /g;
-
-  $val;
-}
-
-sub pre_chew_addr_header {
-  my ($self, $val) = @_;
-  local ($_);
-
-  my @addrs = $self->{main}->find_all_addrs_in_line ($val);
-  my @toks;
-  foreach (@addrs) {
-    push (@toks, $self->tokenize_mail_addrs ($_));
-  }
-  return join (' ', @toks);
-}
-
-sub tokenize_mail_addrs {
-  my ($self, $addr) = @_;
-
-  ($addr =~ /(.+)\@(.+)$/) or return ();
-  my @toks;
-  push(@toks, "U*".$1, "D*".$2);
-  $_ = $2; while (s/^[^\.]+\.(.+)$/$1/gs) { push(@toks, "D*".$1); }
-  return @toks;
-}
-
-###########################################################################
-
 sub ignore_message {
   my ($self,$PMS) = @_;
 
   return 0 unless $self->{use_ignores};
 
-  my $ig_from = $self->{main}->call_plugins ("check_wb_list", { permsgstatus => $PMS, type => 'from', list => 'bayes_ignore_from' });
-  my $ig_to = $self->{main}->call_plugins ("check_wb_list", { permsgstatus => $PMS, type => 'to', list => 'bayes_ignore_to' });
+  my $ig_from = $self->{main}->call_plugins ("check_wb_list",
+        { permsgstatus => $PMS, type => 'from', list => 'bayes_ignore_from' });
+  my $ig_to = $self->{main}->call_plugins ("check_wb_list", 
+        { permsgstatus => $PMS, type => 'to', list => 'bayes_ignore_to' });
 
   my $ignore = $ig_from || $ig_to;
-
   dbg("bayes: not using bayes, bayes_ignore_from or _to rule") if $ignore;
-
   return $ignore;
 }
 
@@ -683,10 +104,9 @@
 
 sub learn {
   my ($self, $isspam, $msg, $id) = @_;
+  return unless $self->{conf}->{use_learner};
+  return unless defined $msg;
 
-  if (!$self->{conf}->{use_bayes}) { return; }
-  if (!defined $msg) { return; }
-
   if( $self->{use_ignores} )  # Remove test when PerMsgStatus available.
   {
     # DMK, koppel@ece.lsu.edu:  Hoping that the ultimate fix to bug 2263 will
@@ -697,321 +117,29 @@
     return 0 if $ignore;
   }
 
-  my $msgdata = $self->get_body_from_msg ($msg);
-  my $ret;
-
-  eval {
-    local $SIG{'__DIE__'};	# do not run user die() traps in here
-
-    my $ok;
-    if ($self->{main}->{learn_to_journal}) {
-      # If we're going to learn to journal, we'll try going r/o first...
-      # If that fails for some reason, let's try going r/w.  This happens
-      # if the DB doesn't exist yet.
-      $ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable();
-    } else {
-      $ok = $self->{store}->tie_db_writable();
-    }
-
-    if ($ok) {
-      $ret = $self->learn_trapped ($isspam, $msg, $msgdata, $id);
-
-      if (!$self->{main}->{learn_caller_will_untie}) {
-        $self->{store}->untie_db();
-      }
-    }
-    1;
-  } or do {		# if we died, untie the dbs.
-    my $eval_stat = $@ ne '' ? $@ : "errno=$!";  chomp $eval_stat;
-    $self->{store}->untie_db();
-    die "bayes: (in learn) $eval_stat\n";
-  };
-
-  return $ret;
+  return $self->{main}->call_plugins("learn_message", { isspam => $isspam, msg => $msg, id => $id });
 }
 
-# this function is trapped by the wrapper above
-sub learn_trapped {
-  my ($self, $isspam, $msg, $msgdata, $msgid) = @_;
-  my @msgid = ( $msgid );
-
-  if (!defined $msgid) {
-    @msgid = $self->get_msgid($msg);
-  }
-
-  foreach $msgid ( @msgid ) {
-    my $seen = $self->{store}->seen_get ($msgid);
-
-    if (defined ($seen)) {
-      if (($seen eq 's' && $isspam) || ($seen eq 'h' && !$isspam)) {
-        dbg("bayes: $msgid already learnt correctly, not learning twice");
-        return 0;
-      } elsif ($seen !~ /^[hs]$/) {
-        warn("bayes: db_seen corrupt: value='$seen' for $msgid, ignored");
-      } else {
-        # bug 3704: If the message was already learned, don't try learning it again.
-        # this prevents, for instance, manually learning as spam, then autolearning
-        # as ham, or visa versa.
-        if ($self->{main}->{learn_no_relearn}) {
-	  dbg("bayes: $msgid already learnt as opposite, not re-learning");
-	  return 0;
-	}
-
-        dbg("bayes: $msgid already learnt as opposite, forgetting first");
-
-        # kluge so that forget() won't untie the db on us ...
-        my $orig = $self->{main}->{learn_caller_will_untie};
-        $self->{main}->{learn_caller_will_untie} = 1;
-
-        my $fatal = !defined $self->forget ($msg);
-
-        # reset the value post-forget() ...
-        $self->{main}->{learn_caller_will_untie} = $orig;
-    
-        # forget() gave us a fatal error, so propagate that up
-        if ($fatal) {
-          dbg("bayes: forget() returned a fatal error, so learn() will too");
-	  return;
-        }
-      }
-
-      # we're only going to have seen this once, so stop if it's been
-      # seen already
-      last;
-    }
-  }
-
-  # Now that we're sure we haven't seen this message before ...
-  $msgid = $msgid[0];
-
-  if ($isspam) {
-    $self->{store}->nspam_nham_change (1, 0);
-  } else {
-    $self->{store}->nspam_nham_change (0, 1);
-  }
-
-  my $msgatime = $msg->receive_date();
-
-  # If the message atime comes back as being more than 1 day in the
-  # future, something's messed up and we should revert to current time as
-  # a safety measure.
-  #
-  $msgatime = time if ( $msgatime - time > 86400 );
-
-  my $tokens = $self->tokenize($msg, $msgdata);
-
-  if ($isspam) {
-    $self->{store}->multi_tok_count_change(1, 0, $tokens, $msgatime);
-  } else {
-    $self->{store}->multi_tok_count_change(0, 1, $tokens, $msgatime);
-  }
-
-  $self->{store}->seen_put ($msgid, ($isspam ? 's' : 'h'));
-  $self->{store}->cleanup();
-
-  $self->{main}->call_plugins("bayes_learn", { toksref => $tokens,
-					       isspam => $isspam,
-					       msgid => $msgid,
-					       msgatime => $msgatime,
-					     });
-
-  dbg("bayes: learned '$msgid', atime: $msgatime");
-
-  1;
-}
-
 ###########################################################################
 
 sub forget {
   my ($self, $msg, $id) = @_;
-
-  if (!$self->{conf}->{use_bayes}) { return; }
-  if (!defined $msg) { return; }
-
-  my $msgdata = $self->get_body_from_msg ($msg);
-  my $ret;
-
-  # we still tie for writing here, since we write to the seen db
-  # synchronously
-  eval {
-    local $SIG{'__DIE__'};	# do not run user die() traps in here
-
-    my $ok;
-    if ($self->{main}->{learn_to_journal}) {
-      # If we're going to learn to journal, we'll try going r/o first...
-      # If that fails for some reason, let's try going r/w.  This happens
-      # if the DB doesn't exist yet.
-      $ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable();
-    } else {
-      $ok = $self->{store}->tie_db_writable();
-    }
-
-    if ($ok) {
-      $ret = $self->forget_trapped ($msg, $msgdata, $id);
-
-      if (!$self->{main}->{learn_caller_will_untie}) {
-        $self->{store}->untie_db();
-      }
-    }
-    1;
-  } or do {		# if we died, untie the dbs.
-    my $eval_stat = $@ ne '' ? $@ : "errno=$!";  chomp $eval_stat;
-    $self->{store}->untie_db();
-    die "bayes: (in forget) $eval_stat\n";
-  };
-
-  return $ret;
+  return unless $self->{conf}->{use_learner};
+  return unless defined $msg;
+  return $self->{main}->call_plugins("forget_message", { msg => $msg, id => $id });
 }
 
-# this function is trapped by the wrapper above
-sub forget_trapped {
-  my ($self, $msg, $msgdata, $msgid) = @_;
-  my @msgid = ( $msgid );
-  my $isspam;
-
-  if (!defined $msgid) {
-    @msgid = $self->get_msgid($msg);
-  }
-
-  while( $msgid = shift @msgid ) {
-    my $seen = $self->{store}->seen_get ($msgid);
-
-    if (defined ($seen)) {
-      if ($seen eq 's') {
-        $isspam = 1;
-      } elsif ($seen eq 'h') {
-        $isspam = 0;
-      } else {
-        dbg("bayes: forget: msgid $msgid seen entry is neither ham nor spam, ignored");
-        return 0;
-      }
-
-      # messages should only be learned once, so stop if we find a msgid
-      # which was seen before
-      last;
-    }
-    else {
-      dbg("bayes: forget: msgid $msgid not learnt, ignored");
-    }
-  }
-
-  # This message wasn't learnt before, so return
-  if (!defined $isspam) {
-    dbg("bayes: forget: no msgid from this message has been learnt, skipping message");
-    return 0;
-  }
-  elsif ($isspam) {
-    $self->{store}->nspam_nham_change (-1, 0);
-  }
-  else {
-    $self->{store}->nspam_nham_change (0, -1);
-  }
-
-  my $tokens = $self->tokenize($msg, $msgdata);
-
-  if ($isspam) {
-    $self->{store}->multi_tok_count_change (-1, 0, $tokens);
-  } else {
-    $self->{store}->multi_tok_count_change (0, -1, $tokens);
-  }
-
-  $self->{store}->seen_delete ($msgid);
-  $self->{store}->cleanup();
-
-  $self->{main}->call_plugins("bayes_forget", { toksref => $tokens,
-						isspam => $isspam,
-						msgid => $msgid,
-					      });
-
-  1;
-}
-
 ###########################################################################
 
-sub get_msgid {
-  my ($self, $msg) = @_;
-
-  my @msgid;
-
-  my $msgid = $msg->get_header("Message-Id");
-  if (defined $msgid && $msgid ne '' && $msgid !~ /^\s*<\s*(?:\@sa_generated)?>.*$/) {
-    # remove \r and < and > prefix/suffixes
-    chomp $msgid;
-    $msgid =~ s/^<//; $msgid =~ s/>.*$//g;
-    push(@msgid, $msgid);
-  }
-
-  # Use sha1_hex(Date:, last received: and top N bytes of body)
-  # where N is MIN(1024 bytes, 1/2 of body length)
-  #
-  my $date = $msg->get_header("Date");
-  $date = "None" if (!defined $date || $date eq ''); # No Date?
-
-  my @rcvd = $msg->get_header("Received");
-  my $rcvd = $rcvd[$#rcvd];
-  $rcvd = "None" if (!defined $rcvd || $rcvd eq ''); # No Received?
-
-  # Make a copy since pristine_body is a reference ...
-  my $body = join('', $msg->get_pristine_body());
-  if (length($body) > 64) { # Small Body?
-    my $keep = ( length $body > 2048 ? 1024 : int(length($body) / 2) );
-    substr($body, $keep) = '';
-  }
-
-  unshift(@msgid, sha1_hex($date."\000".$rcvd."\000".$body).'@sa_generated');
-
-  return wantarray ? @msgid : $msgid[0];
-}
-
-sub get_body_from_msg {
-  my ($self, $msg) = @_;
-
-  if (!ref $msg) {
-    # I have no idea why this seems to happen. TODO
-    warn "bayes: msg not a ref: '$msg'";
-    return { };
-  }
-
-  my $permsgstatus =
-        Mail::SpamAssassin::PerMsgStatus->new($self->{main}, $msg);
-  $msg->extract_message_metadata ($permsgstatus);
-  my $msgdata = $self->get_msgdata_from_permsgstatus ($permsgstatus);
-  $permsgstatus->finish();
-
-  if (!defined $msgdata) {
-    # why?!
-    warn "bayes: failed to get body for ".scalar($self->get_msgid($self->{msg}))."\n";
-    return { };
-  }
-
-  return $msgdata;
-}
-
-sub get_msgdata_from_permsgstatus {
-  my ($self, $msg) = @_;
-
-  my $msgdata = { };
-  $msgdata->{bayes_token_body} = $msg->{msg}->get_visible_rendered_body_text_array();
-  $msgdata->{bayes_token_inviz} = $msg->{msg}->get_invisible_rendered_body_text_array();
-  @{$msgdata->{bayes_token_uris}} = $msg->get_uri_list();
-  return $msgdata;
-}
-
-###########################################################################
-
 sub sync {
   my ($self, $sync, $expire, $opts) = @_;
-  if (!$self->{conf}->{use_bayes}) { return 0; }
+  return 0 unless $self->{conf}->{use_learner};
 
   if ($sync) {
-    dbg("bayes: bayes journal sync starting");
-    $self->{store}->sync($opts);
-    dbg("bayes: bayes journal sync completed");
+    $self->{main}->call_plugins("learner_sync", $opts );
   }
   if ($expire) {
-    dbg("bayes: expiry starting");
-    $self->{store}->expire_old_tokens($opts);
-    dbg("bayes: expiry completed");
+    $self->{main}->call_plugins("learner_expire_old_training", $opts );
   }
 
   return 0;
@@ -1019,408 +147,21 @@
 
 ###########################################################################
 
-# compute the probability that a token is spammish
-sub compute_prob_for_token {
-  my ($self, $token, $ns, $nn, $s, $n) = @_;
-
-  # we allow the caller to give us the token information, just
-  # to save a potentially expensive lookup
-  if (!defined($s) || !defined($n)) {
-    ($s, $n, undef) = $self->{store}->tok_get ($token);
-  }
-
-  return if ($s == 0 && $n == 0);
-
-  if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
-    return if ($s + $n < 10);      # ignore low-freq tokens
-  }
-
-  if (!$self->{use_hapaxes}) {
-    return if ($s + $n < 2);
-  }
-
-  return if ( $ns == 0 || $nn == 0 );
-
-  my $ratios = ($s / $ns);
-  my $ration = ($n / $nn);
-
-  my $prob;
-
-  if ($ratios == 0 && $ration == 0) {
-    warn "bayes: oops? ratios == ration == 0";
-    return;
-  } else {
-    $prob = ($ratios) / ($ration + $ratios);
-  }
-
-  if (USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
-    # use Robinson's f(x) equation for low-n tokens, instead of just
-    # ignoring them
-    my $robn = $s+$n;
-    $prob = ($Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X + ($robn * $prob))
-                             /
-            ($Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT + $robn);
-  }
-
-  # 'log_raw_counts' is used to log the raw data for the Bayes equations during
-  # a mass-check, allowing the S and X constants to be optimized quickly
-  # without requiring re-tokenization of the messages for each attempt. There's
-  # really no need for this code to be uncommented in normal use, however.   It
-  # has never been publicly documented, so commenting it out is fine. ;)
-
-  ## if ($self->{log_raw_counts}) {
-  ## $self->{raw_counts} .= " s=$s,n=$n ";
-  ## }
-
-  return $prob;
-}
-
-###########################################################################
-# If a token is neither hammy nor spammy, return 0.
-# For a spammy token, return the minimum number of additional ham messages
-# it would have had to appear in to no longer be spammy.  Hammy tokens
-# are handled similarly.  That's what the function does (at the time
-# of this writing, 31 July 2003, 16:02:55 CDT).  It would be slightly
-# more useful if it returned the number of /additional/ ham messages
-# a spammy token would have to appear in to no longer be spammy but I
-# fear that might require the solution to a cubic equation, and I
-# just don't have the time for that now.
-
-sub compute_declassification_distance {
-  my ($self, $Ns, $Nn, $ns, $nn, $prob) = @_;
-
-  return 0 if $ns == 0 && $nn == 0;
-
-  if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {return 0 if ($ns + $nn < 10);}
-  if (!$self->{use_hapaxes}) {return 0 if ($ns + $nn < 2);}
-
-  return 0 if $Ns == 0 || $Nn == 0;
-  return 0 if abs( $prob - 0.5 ) <
-                $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
-
-  my ($Na,$na,$Nb,$nb) = $prob > 0.5 ? ($Nn,$nn,$Ns,$ns) : ($Ns,$ns,$Nn,$nn);
-  my $p = 0.5 - $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
-
-  return int( 1.0 - 1e-6 + $nb * $Na * $p / ($Nb * ( 1 - $p )) ) - $na
-    unless USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS;
-
-  my $s = $Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT;
-  my $sx = $Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X;
-  my $a = $Nb * ( 1 - $p );
-  my $b = $Nb * ( $sx + $nb * ( 1 - $p ) - $p * $s ) - $p * $Na * $nb;
-  my $c = $Na * $nb * ( $sx - $p * ( $s + $nb ) );
-  my $discrim = $b * $b - 4 * $a * $c;
-  my $disc_max_0 = $discrim < 0 ? 0 : $discrim;
-  my $dd_exact = ( 1.0 - 1e-6 + ( -$b + sqrt( $disc_max_0 ) ) / ( 2*$a ) ) - $na;
-
-  # This shouldn't be necessary.  Should not be < 1
-  return $dd_exact < 1 ? 1 : int($dd_exact);
-}
-
-
-# Check to make sure we can tie() the DB, and we have enough entries to do a scan
-# if we're told the caller will untie(), go ahead and leave the db tied.
 sub is_scan_available {
   my $self = shift;
-
-  return 0 unless $self->{conf}->{use_bayes};
-  return 0 unless $self->{store}->tie_db_readonly();
-
-  # We need the DB to stay tied, so if the journal sync occurs, don't untie!
-  my $caller_untie = $self->{main}->{learn_caller_will_untie};
-  $self->{main}->{learn_caller_will_untie} = 1;
-
-  # Do a journal sync if necessary.  Do this before the nspam_nham_get()
-  # call since the sync may cause an update in the number of messages
-  # learnt.
-  $self->opportunistic_calls(1);
-
-  # Reset the variable appropriately
-  $self->{main}->{learn_caller_will_untie} = $caller_untie;
-
-  my ($ns, $nn) = $self->{store}->nspam_nham_get();
-
-  if ($ns < $self->{conf}->{bayes_min_spam_num}) {
-    dbg("bayes: not available for scanning, only $ns spam(s) in bayes DB < ".$self->{conf}->{bayes_min_spam_num});
-    if (!$self->{main}->{learn_caller_will_untie}) {
-      $self->{store}->untie_db();
-    }
-    return 0;
-  }
-  if ($nn < $self->{conf}->{bayes_min_ham_num}) {
-    dbg("bayes: not available for scanning, only $nn ham(s) in bayes DB < ".$self->{conf}->{bayes_min_ham_num});
-    if (!$self->{main}->{learn_caller_will_untie}) {
-      $self->{store}->untie_db();
-    }
-    return 0;
-  }
-
-  return 1;
+  return 0 unless $self->{conf}->{use_learner};
+  return $self->{main}->call_plugins("learner_is_scan_available");
 }
 
 ###########################################################################
-# Finally, the scoring function for testing mail.
 
-sub scan {
-  my ($self, $permsgstatus, $msg) = @_;
-  my $score;
-
-  # When we're doing a scan, we'll guarantee that we'll do the untie,
-  # so override the global setting until we're done.
-  my $caller_untie = $self->{main}->{learn_caller_will_untie};
-  $self->{main}->{learn_caller_will_untie} = 1;
-
-  goto skip if ($self->ignore_message($permsgstatus));
-
-  goto skip unless $self->is_scan_available();
-
-  my ($ns, $nn) = $self->{store}->nspam_nham_get();
-
-  ## if ($self->{log_raw_counts}) { # see compute_prob_for_token()
-  ## $self->{raw_counts} = " ns=$ns nn=$nn ";
-  ## }
-
-  dbg("bayes: corpus size: nspam = $ns, nham = $nn");
-
-  my $msgdata = $self->get_msgdata_from_permsgstatus ($permsgstatus);
-
-  my $msgtokens = $self->tokenize($msg, $msgdata);
-
-  my $tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens});
-
-  my %pw;
-
-  foreach my $tokendata (@{$tokensdata}) {
-    my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata};
-    my $prob = $self->compute_prob_for_token($token, $ns, $nn, $tok_spam, $tok_ham);
-    next unless defined $prob;
-
-    $pw{$token} = {
-      prob => $prob,
-      spam_count => $tok_spam,
-      ham_count => $tok_ham,
-      atime => $atime
-    };
-  }
-
-  # If none of the tokens were found in the DB, we're going to skip
-  # this message...
-  if (!keys %pw) {
-    dbg("bayes: cannot use bayes on this message; none of the tokens were found in the database");
-    goto skip;
-  }
-
-  my $tcount_total = keys %{$msgtokens};
-  my $tcount_learned = keys %pw;
-
-  # Figure out the message receive time (used as atime below)
-  # If the message atime comes back as being in the future, something's
-  # messed up and we should revert to current time as a safety measure.
-  #
-  my $msgatime = $msg->receive_date();
-  my $now = time;
-  $msgatime = $now if ( $msgatime > $now );
-
-  # now take the $count most significant tokens and calculate probs using
-  # Robinson's formula.
-  my $count = N_SIGNIFICANT_TOKENS;
-  my @sorted;
-
-  my @touch_tokens;
-  my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = [];
-  my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = [];
-
-  my %tok_strength = map { $_ => (abs($pw{$_}->{prob} - 0.5)) } keys %pw;
-  my $log_each_token = (would_log('dbg', 'bayes') > 1);
-
-  foreach my $tok (sort {
-              $tok_strength{$b} <=> $tok_strength{$a}
-            } keys %pw)
-  {
-    if ($count-- < 0) { last; }
-    next if ($tok_strength{$tok} <
-                $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH);
-
-    my $pw = $pw{$tok}->{prob};
-
-    # What's more expensive, scanning headers for HAMMYTOKENS and
-    # SPAMMYTOKENS tags that aren't there or collecting data that
-    # won't be used?  Just collecting the data is certainly simpler.
-    #
-    my $raw_token = $msgtokens->{$tok} || "(unknown)";
-    my $s = $pw{$tok}->{spam_count};
-    my $n = $pw{$tok}->{ham_count};
-    my $a = $pw{$tok}->{atime};
-
-    if ($pw < 0.5) {
-      push @$tinfo_hammy,  [$raw_token,$pw,$s,$n,$a];
-    } else {
-      push @$tinfo_spammy, [$raw_token,$pw,$s,$n,$a];
-    }
-
-    push (@sorted, $pw);
-
-    # update the atime on this token, it proved useful
-    push(@touch_tokens, $tok);
-
-    if ($log_each_token) {
-      dbg("bayes: token '$raw_token' => $pw");
-    }
-  }
-
-  if (!@sorted || (REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE > 0 && 
-	$#sorted <= REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE))
-  {
-    dbg("bayes: cannot use bayes on this message; not enough usable tokens found");
-    goto skip;
-  }
-
-  $score = Mail::SpamAssassin::Bayes::Combine::combine($ns, $nn, \@sorted);
-
-  # Couldn't come up with a probability?
-  goto skip unless defined $score;
-
-  dbg("bayes: score = $score");
-
-  # no need to call tok_touch_all unless there were significant
-  # tokens and a score was returned
-  # we don't really care about the return value here
-  $self->{store}->tok_touch_all(\@touch_tokens, $msgatime);
-
-  $permsgstatus->{bayes_nspam} = $ns;
-  $permsgstatus->{bayes_nham} = $nn;
-
-  ## if ($self->{log_raw_counts}) { # see compute_prob_for_token()
-  ## print "#Bayes-Raw-Counts: $self->{raw_counts}\n";
-  ## }
-
-  $self->{main}->call_plugins("bayes_scan", { toksref => $msgtokens,
-					      probsref => \%pw,
-					      score => $score,
-					      msgatime => $msgatime,
-					      significant_tokens => \@touch_tokens,
-					    });
-
-skip:
-  if (!defined $score) {
-    dbg("bayes: not scoring message, returning undef");
-  }
-
-  # Take any opportunistic actions we can take
-  if ($self->{main}->{opportunistic_expire_check_only}) {
-    # we're supposed to report on expiry only -- so do the
-    # opportunistic_calls() run for the journal only.
-    $self->opportunistic_calls(1);
-    $permsgstatus->{bayes_expiry_due} = $self->{store}->expiry_due();
-  }
-  else {
-    $self->opportunistic_calls();
-  }
-
-  # Do any cleanup we need to do
-  $self->{store}->cleanup();
-
-  # Reset the value accordingly
-  $self->{main}->{learn_caller_will_untie} = $caller_untie;
-
-  # If our caller won't untie the db, we need to do it.
-  if (!$caller_untie) {
-    $self->{store}->untie_db();
-  }
-
-  $permsgstatus->{tag_data}{BAYESTCHAMMY} = 
-                        ($tinfo_hammy ? scalar @{$tinfo_hammy} : 0);
-  $permsgstatus->{tag_data}{BAYESTCSPAMMY} = 
-                        ($tinfo_spammy ? scalar @{$tinfo_spammy} : 0);
-  $permsgstatus->{tag_data}{BAYESTCLEARNED} = $tcount_learned;
-  $permsgstatus->{tag_data}{BAYESTC} = $tcount_total;
-
-  return $score;
-}
-
-sub opportunistic_calls {
-  my($self, $journal_only) = @_;
-
-  # If we're not already tied, abort.
-  if (!$self->{store}->db_readable()) {
-    dbg("bayes: opportunistic call attempt failed, DB not readable");
-    return;
-  }
-
-  # Is an expire or sync running?
-  my $running_expire = $self->{store}->get_running_expire_tok();
-  if ( defined $running_expire && $running_expire+$OPPORTUNISTIC_LOCK_VALID > time() ) {
-    dbg("bayes: opportunistic call attempt skipped, found fresh running expire magic token");
-    return;
-  }
-
-  # handle expiry and syncing
-  if (!$journal_only && $self->{store}->expiry_due()) {
-    dbg("bayes: opportunistic call found expiry due");
-
-    # sync will bring the DB R/W as necessary, and the expire will remove
-    # the running_expire token, may untie as well.
-    $self->sync(1,1);
-  }
-  elsif ( $self->{store}->sync_due() ) {
-    dbg("bayes: opportunistic call found journal sync due");
-
-    # sync will bring the DB R/W as necessary, may untie as well
-    $self->sync(1,0);
-
-    # We can only remove the running_expire token if we're doing R/W
-    if ($self->{store}->db_writable()) {
-      $self->{store}->remove_running_expire_tok();
-    }
-  }
-
-  return;
-}
-
-###########################################################################
-
 sub dump_bayes_db {
   my($self, $magic, $toks, $regex) = @_;
-
-  # allow dump to occur even if use_bayes disables everything else ...
-  #return 0 unless $self->{conf}->{use_bayes};
-  return 0 unless $self->{store}->tie_db_readonly();
-  
-  my @vars = $self->{store}->get_storage_variables();
-
-  my($sb,$ns,$nh,$nt,$le,$oa,$bv,$js,$ad,$er,$na) = @vars;
-
-  my $template = '%3.3f %10u %10u %10u  %s'."\n";
-
-  if ( $magic ) {
-    printf ($template, 0.0, 0, $bv, 0, 'non-token data: bayes db version');
-    printf ($template, 0.0, 0, $ns, 0, 'non-token data: nspam');
-    printf ($template, 0.0, 0, $nh, 0, 'non-token data: nham');
-    printf ($template, 0.0, 0, $nt, 0, 'non-token data: ntokens');
-    printf ($template, 0.0, 0, $oa, 0, 'non-token data: oldest atime');
-    printf ($template, 0.0, 0, $na, 0, 'non-token data: newest atime') if ( $bv >= 2 );
-    printf ($template, 0.0, 0, $sb, 0, 'non-token data: current scan-count') if ( $bv < 2 );
-    printf ($template, 0.0, 0, $js, 0, 'non-token data: last journal sync atime') if ( $bv >= 2 );
-    printf ($template, 0.0, 0, $le, 0, 'non-token data: last expiry atime');
-    if ( $bv >= 2 ) {
-      printf ($template, 0.0, 0, $ad, 0, 'non-token data: last expire atime delta');
-      printf ($template, 0.0, 0, $er, 0, 'non-token data: last expire reduction count');
-    }
-  }
-
-  if ( $toks ) {
-    # let the store sort out the db_toks
-    $self->{store}->dump_db_toks($template, $regex, @vars);
-  }
-
-  if (!$self->{main}->{learn_caller_will_untie}) {
-    $self->{store}->untie_db();
-  }
-  return 1;
+  return 0 unless $self->{conf}->{use_learner};
+  return $self->{main}->call_plugins("learner_dump_database", { 
+            magic => $magic, toks => $toks, regex => $regex });
 }
 
 1;
 
-=back
-
 =cut
Index: rules/23_bayes.cf
===================================================================
--- rules/23_bayes.cf	(.../trunk)	(revision 602913)
+++ rules/23_bayes.cf	(.../branches/bug-5293-pluginized-bayes)	(revision 602913)
@@ -53,14 +53,14 @@
 tflags BAYES_95		learn
 tflags BAYES_99		learn
 
-describe BAYES_00	Bayesian spam probability is 0 to 1%
-describe BAYES_05	Bayesian spam probability is 1 to 5%
-describe BAYES_20	Bayesian spam probability is 5 to 20%
-describe BAYES_40	Bayesian spam probability is 20 to 40%
-describe BAYES_50	Bayesian spam probability is 40 to 60%
-describe BAYES_60	Bayesian spam probability is 60 to 80%
-describe BAYES_80	Bayesian spam probability is 80 to 95%
-describe BAYES_95	Bayesian spam probability is 95 to 99%
-describe BAYES_99	Bayesian spam probability is 99 to 100%
+describe BAYES_00	Bayes spam probability is 0 to 1%
+describe BAYES_05	Bayes spam probability is 1 to 5%
+describe BAYES_20	Bayes spam probability is 5 to 20%
+describe BAYES_40	Bayes spam probability is 20 to 40%
+describe BAYES_50	Bayes spam probability is 40 to 60%
+describe BAYES_60	Bayes spam probability is 60 to 80%
+describe BAYES_80	Bayes spam probability is 80 to 95%
+describe BAYES_95	Bayes spam probability is 95 to 99%
+describe BAYES_99	Bayes spam probability is 99 to 100%
 
 endif
Index: t/bayessdbm.t
===================================================================
--- t/bayessdbm.t	(.../trunk)	(revision 602913)
+++ t/bayessdbm.t	(.../branches/bug-5293-pluginized-bayes)	(revision 602913)
@@ -34,8 +34,12 @@
 
 ok($sa);
 
-ok($sa->{bayes_scanner});
+sub getimpl {
+  return $sa->call_plugins("learner_get_implementation");
+}
 
+ok($sa->{bayes_scanner} && getimpl);
+
 ok(!$sa->{bayes_scanner}->is_scan_available());
 
 open(MAIL,"< data/spam/001");
@@ -52,49 +56,49 @@
 
 ok($mail);
 
-my $body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+my $body = getimpl->get_body_from_msg($mail);
 
 ok($body);
 
-my $toks = $sa->{bayes_scanner}->tokenize($mail, $body);
+my $toks = getimpl->tokenize($mail, $body);
 
 ok(scalar(keys %{$toks}) > 0);
 
-my($msgid,$msgid_hdr) = $sa->{bayes_scanner}->get_msgid($mail);
+my($msgid,$msgid_hdr) = getimpl->get_msgid($mail);
 
 # $msgid is the generated hash messageid
 # $msgid_hdr is the Message-Id header
 ok($msgid eq 'ce33e4a8bc5798c65428d6018380bae346c7c126@sa_generated');
 ok($msgid_hdr eq '9PS291LhupY');
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
-ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid));
+ok(!getimpl->{store}->seen_get($msgid));
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
 ok($sa->{bayes_scanner}->learn(1, $mail));
 
 ok(!$sa->{bayes_scanner}->learn(1, $mail));
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
-ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 's');
+ok(getimpl->{store}->seen_get($msgid) eq 's');
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
 my $tokerror = 0;
 foreach my $tok (keys %{$toks}) {
-  my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok);
+  my ($spam, $ham, $atime) = getimpl->{store}->tok_get($tok);
   if ($spam == 0 || $ham > 0) {
     $tokerror = 1;
   }
 }
 ok(!$tokerror);
 
-my $tokens = $sa->{bayes_scanner}->{store}->tok_get_all(keys %{$toks});
+my $tokens = getimpl->{store}->tok_get_all(keys %{$toks});
 
 ok($tokens);
 
@@ -107,36 +111,36 @@
 }
 ok(!$tokerror);
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
 ok($sa->{bayes_scanner}->learn(0, $mail));
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
-ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 'h');
+ok(getimpl->{store}->seen_get($msgid) eq 'h');
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
 $tokerror = 0;
 foreach my $tok (keys %{$toks}) {
-  my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok);
+  my ($spam, $ham, $atime) = getimpl->{store}->tok_get($tok);
   if ($spam  > 0 || $ham == 0) {
     $tokerror = 1;
   }
 }
 ok(!$tokerror);
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
 ok($sa->{bayes_scanner}->forget($mail));
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
-ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid));
+ok(!getimpl->{store}->seen_get($msgid));
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
 undef $sa;
 
@@ -217,13 +221,13 @@
 
 $mail = $sa->parse( $raw_message );
 
-$body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+$body = getimpl->get_body_from_msg($mail);
 
 my $msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);
 
 ok($msgstatus);
 
-my $score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body);
+my $score = getimpl->scan($msgstatus, $mail, $body);
 
 # Pretty much we can't count on the data returned with such little training
 # so just make sure that the score wasn't equal to .5 which is the default
@@ -242,11 +246,11 @@
 
 $mail = $sa->parse( $raw_message );
 
-$body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+$body = getimpl->get_body_from_msg($mail);
 
 $msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);
 
-$score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body);
+$score = getimpl->scan($msgstatus, $mail, $body);
 
 # Pretty much we can't count on the data returned with such little training
 # so just make sure that the score wasn't equal to .5 which is the default
@@ -256,7 +260,7 @@
 
 }
 
-ok($sa->{bayes_scanner}->{store}->clear_database());
+ok(getimpl->{store}->clear_database());
 
 ok(!-e 'log/user_state/bayes_journal');
 ok(!-e 'log/user_state/bayes_seen.pag');
Index: t/bayessql.t
===================================================================
--- t/bayessql.t	(.../trunk)	(revision 602913)
+++ t/bayessql.t	(.../branches/bug-5293-pluginized-bayes)	(revision 602913)
@@ -60,17 +60,21 @@
 
 ok($sa);
 
-ok($sa->{bayes_scanner});
+sub getimpl {
+  return $sa->call_plugins("learner_get_implementation");
+}
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok($sa->{bayes_scanner} && getimpl);
 
+ok(getimpl->{store}->tie_db_writable());
+
 # This bit breaks abstraction a bit, the userid is an implementation detail,
 # but is necessary to perform some of the tests.  Perhaps in the future we
 # can add some sort of official API for this sort of thing.
-my $testuserid = $sa->{bayes_scanner}->{store}->{_userid};
+my $testuserid = getimpl->{store}->{_userid};
 ok(defined($testuserid));
 
-ok($sa->{bayes_scanner}->{store}->clear_database());
+ok(getimpl->{store}->clear_database());
 
 ok(database_clear_p($testuser, $testuserid));
 
@@ -95,7 +99,7 @@
 
 ok($sa->{bayes_scanner});
 
-ok(!$sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(!getimpl->{store}->tie_db_writable());
 
 $sa->finish_learner();
 
@@ -138,49 +142,49 @@
 
 ok($mail);
 
-my $body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+my $body = getimpl->get_body_from_msg($mail);
 
 ok($body);
 
-my $toks = $sa->{bayes_scanner}->tokenize($mail, $body);
+my $toks = getimpl->tokenize($mail, $body);
 
 ok(scalar(keys %{$toks}) > 0);
 
-my($msgid,$msgid_hdr) = $sa->{bayes_scanner}->get_msgid($mail);
+my($msgid,$msgid_hdr) = getimpl->get_msgid($mail);
 
 # $msgid is the generated hash messageid
 # $msgid_hdr is the Message-Id header
 ok($msgid eq 'ce33e4a8bc5798c65428d6018380bae346c7c126@sa_generated');
 ok($msgid_hdr eq '9PS291LhupY');
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
-ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid));
+ok(!getimpl->{store}->seen_get($msgid));
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
 ok($sa->{bayes_scanner}->learn(1, $mail));
 
 ok(!$sa->{bayes_scanner}->learn(1, $mail));
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
-ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 's');
+ok(getimpl->{store}->seen_get($msgid) eq 's');
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
 my $tokerror = 0;
 foreach my $tok (keys %{$toks}) {
-  my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok);
+  my ($spam, $ham, $atime) = getimpl->{store}->tok_get($tok);
   if ($spam == 0 || $ham > 0) {
     $tokerror = 1;
   }
 }
 ok(!$tokerror);
 
-my $tokens = $sa->{bayes_scanner}->{store}->tok_get_all(keys %{$toks});
+my $tokens = getimpl->{store}->tok_get_all(keys %{$toks});
 
 ok($tokens);
 
@@ -194,44 +198,44 @@
 
 ok(!$tokerror);
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
 ok($sa->{bayes_scanner}->learn(0, $mail));
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
-ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 'h');
+ok(getimpl->{store}->seen_get($msgid) eq 'h');
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
 $tokerror = 0;
 foreach my $tok (keys %{$toks}) {
-  my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok);
+  my ($spam, $ham, $atime) = getimpl->{store}->tok_get($tok);
   if ($spam  > 0 || $ham == 0) {
     $tokerror = 1;
   }
 }
 ok(!$tokerror);
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
 ok($sa->{bayes_scanner}->forget($mail));
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
-ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid));
+ok(!getimpl->{store}->seen_get($msgid));
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
 # This bit breaks abstraction a bit, the userid is an implementation detail,
 # but is necessary to perform some of the tests.  Perhaps in the future we
 # can add some sort of official API for this sort of thing.
-$testuserid = $sa->{bayes_scanner}->{store}->{_userid};
+$testuserid = getimpl->{store}->{_userid};
 ok(defined($testuserid));
 
-ok($sa->{bayes_scanner}->{store}->clear_database());
+ok(getimpl->{store}->clear_database());
 
 ok(database_clear_p($testuser, $testuserid));
 
@@ -293,13 +297,13 @@
 
 $mail = $sa->parse( \@msg );
 
-$body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+$body = getimpl->get_body_from_msg($mail);
 
 my $msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);
 
 ok($msgstatus);
 
-my $score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body);
+my $score = getimpl->scan($msgstatus, $mail, $body);
 
 # Pretty much we can't count on the data returned with such little training
 # so just make sure that the score wasn't equal to .5 which is the default
@@ -324,11 +328,11 @@
 
 $mail = $sa->parse( \@msg );
 
-$body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+$body = getimpl->get_body_from_msg($mail);
 
 $msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);
 
-$score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body);
+$score = getimpl->scan($msgstatus, $mail, $body);
 
 # Pretty much we can't count on the data returned with such little training
 # so just make sure that the score wasn't equal to .5 which is the default
@@ -340,10 +344,10 @@
 # This bit breaks abstraction a bit, the userid is an implementation detail,
 # but is necessary to perform some of the tests.  Perhaps in the future we
 # can add some sort of official API for this sort of thing.
-$testuserid = $sa->{bayes_scanner}->{store}->{_userid};
+$testuserid = getimpl->{store}->{_userid};
 ok(defined($testuserid));
 
-ok($sa->{bayes_scanner}->{store}->clear_database());
+ok(getimpl->{store}->clear_database());
 
 ok(database_clear_p($testuser, $testuserid));
 
Index: t/bayesdbm.t
===================================================================
--- t/bayesdbm.t	(.../trunk)	(revision 602913)
+++ t/bayesdbm.t	(.../branches/bug-5293-pluginized-bayes)	(revision 602913)
@@ -31,9 +31,12 @@
 
 $sa->init();
 
+sub getimpl {
+  return $sa->call_plugins("learner_get_implementation");
+}
 ok($sa);
 
-ok($sa->{bayes_scanner});
+ok ($sa->{bayes_scanner} && getimpl);
 
 ok(!$sa->{bayes_scanner}->is_scan_available());
 
@@ -51,15 +54,15 @@
 
 ok($mail);
 
-my $body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+my $body = getimpl->get_body_from_msg($mail);
 
 ok($body);
 
-my $toks = $sa->{bayes_scanner}->tokenize($mail, $body);
+my $toks = getimpl->tokenize($mail, $body);
 
 ok(scalar(keys %{$toks}) > 0);
 
-my($msgid,$msgid_hdr) = $sa->{bayes_scanner}->get_msgid($mail);
+my($msgid,$msgid_hdr) = getimpl->get_msgid($mail);
 
 # $msgid is the generated hash messageid
 # $msgid_hdr is the Message-Id header
@@ -67,34 +70,34 @@
     or warn "got: [$msgid]";
 ok($msgid_hdr eq '9PS291LhupY');
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
-ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid));
+ok(!getimpl->{store}->seen_get($msgid));
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
 ok($sa->{bayes_scanner}->learn(1, $mail));
 
 ok(!$sa->{bayes_scanner}->learn(1, $mail));
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
-ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 's');
+ok(getimpl->{store}->seen_get($msgid) eq 's');
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
 my $tokerror = 0;
 foreach my $tok (keys %{$toks}) {
-  my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok);
+  my ($spam, $ham, $atime) = getimpl->{store}->tok_get($tok);
   if ($spam == 0 || $ham > 0) {
     $tokerror = 1;
   }
 }
 ok(!$tokerror);
 
-my $tokens = $sa->{bayes_scanner}->{store}->tok_get_all(keys %{$toks});
+my $tokens = getimpl->{store}->tok_get_all(keys %{$toks});
 
 ok($tokens);
 
@@ -107,36 +110,36 @@
 }
 ok(!$tokerror);
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
 ok($sa->{bayes_scanner}->learn(0, $mail));
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
-ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 'h');
+ok(getimpl->{store}->seen_get($msgid) eq 'h');
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
 $tokerror = 0;
 foreach my $tok (keys %{$toks}) {
-  my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok);
+  my ($spam, $ham, $atime) = getimpl->{store}->tok_get($tok);
   if ($spam  > 0 || $ham == 0) {
     $tokerror = 1;
   }
 }
 ok(!$tokerror);
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
 ok($sa->{bayes_scanner}->forget($mail));
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
-ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid));
+ok(!getimpl->{store}->seen_get($msgid));
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
 undef $sa;
 
@@ -213,13 +216,13 @@
 
 $mail = $sa->parse( $raw_message );
 
-$body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+$body = getimpl->get_body_from_msg($mail);
 
 my $msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);
 
 ok($msgstatus);
 
-my $score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body);
+my $score = getimpl->scan($msgstatus, $mail, $body);
 
 # Pretty much we can't count on the data returned with such little training
 # so just make sure that the score wasn't equal to .5 which is the default
@@ -238,11 +241,11 @@
 
 $mail = $sa->parse( $raw_message );
 
-$body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+$body = getimpl->get_body_from_msg($mail);
 
 $msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);
 
-$score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body);
+$score = getimpl->scan($msgstatus, $mail, $body);
 
 # Pretty much we can't count on the data returned with such little training
 # so just make sure that the score wasn't equal to .5 which is the default
@@ -252,7 +255,7 @@
 
 }
 
-ok($sa->{bayes_scanner}->{store}->clear_database());
+ok(getimpl->{store}->clear_database());
 
 ok(!-e 'log/user_state/bayes_journal');
 ok(!-e 'log/user_state/bayes_seen');
Index: t/bayesdbm_flock.t
===================================================================
--- t/bayesdbm_flock.t	(.../trunk)	(revision 602913)
+++ t/bayesdbm_flock.t	(.../branches/bug-5293-pluginized-bayes)	(revision 602913)
@@ -34,8 +34,12 @@
 
 ok($sa);
 
-ok($sa->{bayes_scanner});
+sub getimpl {
+  return $sa->call_plugins("learner_get_implementation");
+}
 
+ok(getimpl && $sa->{bayes_scanner});
+
 ok(!$sa->{bayes_scanner}->is_scan_available());
 
 open(MAIL,"< data/spam/001");
@@ -52,49 +56,49 @@
 
 ok($mail);
 
-my $body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+my $body = getimpl->get_body_from_msg($mail);
 
 ok($body);
 
-my $toks = $sa->{bayes_scanner}->tokenize($mail, $body);
+my $toks = getimpl->tokenize($mail, $body);
 
 ok(scalar(keys %{$toks}) > 0);
 
-my($msgid,$msgid_hdr) = $sa->{bayes_scanner}->get_msgid($mail);
+my($msgid,$msgid_hdr) = getimpl->get_msgid($mail);
 
 # $msgid is the generated hash messageid
 # $msgid_hdr is the Message-Id header
 ok($msgid eq 'ce33e4a8bc5798c65428d6018380bae346c7c126@sa_generated');
 ok($msgid_hdr eq '9PS291LhupY');
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
-ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid));
+ok(!getimpl->{store}->seen_get($msgid));
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
 ok($sa->{bayes_scanner}->learn(1, $mail));
 
 ok(!$sa->{bayes_scanner}->learn(1, $mail));
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
-ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 's');
+ok(getimpl->{store}->seen_get($msgid) eq 's');
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
 my $tokerror = 0;
 foreach my $tok (keys %{$toks}) {
-  my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok);
+  my ($spam, $ham, $atime) = getimpl->{store}->tok_get($tok);
   if ($spam == 0 || $ham > 0) {
     $tokerror = 1;
   }
 }
 ok(!$tokerror);
 
-my $tokens = $sa->{bayes_scanner}->{store}->tok_get_all(keys %{$toks});
+my $tokens = getimpl->{store}->tok_get_all(keys %{$toks});
 
 ok($tokens);
 
@@ -107,36 +111,36 @@
 }
 ok(!$tokerror);
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
 ok($sa->{bayes_scanner}->learn(0, $mail));
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
-ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 'h');
+ok(getimpl->{store}->seen_get($msgid) eq 'h');
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
 $tokerror = 0;
 foreach my $tok (keys %{$toks}) {
-  my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok);
+  my ($spam, $ham, $atime) = getimpl->{store}->tok_get($tok);
   if ($spam  > 0 || $ham == 0) {
     $tokerror = 1;
   }
 }
 ok(!$tokerror);
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
 ok($sa->{bayes_scanner}->forget($mail));
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
-ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid));
+ok(!getimpl->{store}->seen_get($msgid));
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
 undef $sa;
 
@@ -213,13 +217,13 @@
 
 $mail = $sa->parse( $raw_message );
 
-$body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+$body = getimpl->get_body_from_msg($mail);
 
 my $msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);
 
 ok($msgstatus);
 
-my $score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body);
+my $score = getimpl->scan($msgstatus, $mail, $body);
 
 # Pretty much we can't count on the data returned with such little training
 # so just make sure that the score wasn't equal to .5 which is the default
@@ -238,11 +242,11 @@
 
 $mail = $sa->parse( $raw_message );
 
-$body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+$body = getimpl->get_body_from_msg($mail);
 
 $msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);
 
-$score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body);
+$score = getimpl->scan($msgstatus, $mail, $body);
 
 # Pretty much we can't count on the data returned with such little training
 # so just make sure that the score wasn't equal to .5 which is the default
@@ -252,7 +256,7 @@
 
 }
 
-ok($sa->{bayes_scanner}->{store}->clear_database());
+ok(getimpl->{store}->clear_database());
 
 ok(!-e 'log/user_state/bayes_journal');
 ok(!-e 'log/user_state/bayes_seen');
Index: t/bayessdbm_seen_delete.t
===================================================================
--- t/bayessdbm_seen_delete.t	(.../trunk)	(revision 602913)
+++ t/bayessdbm_seen_delete.t	(.../branches/bug-5293-pluginized-bayes)	(revision 602913)
@@ -34,8 +34,12 @@
 
 ok($sa);
 
-ok($sa->{bayes_scanner});
+sub getimpl {
+  return $sa->call_plugins("learner_get_implementation");
+}
 
+ok($sa->{bayes_scanner} && getimpl);
+
 ok(!$sa->{bayes_scanner}->is_scan_available());
 
 open(MAIL,"< data/spam/001");
@@ -52,49 +56,49 @@
 
 ok($mail);
 
-my $body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+my $body = getimpl->get_body_from_msg($mail);
 
 ok($body);
 
-my $toks = $sa->{bayes_scanner}->tokenize($mail, $body);
+my $toks = getimpl->tokenize($mail, $body);
 
 ok(scalar(keys %{$toks}) > 0);
 
-my($msgid,$msgid_hdr) = $sa->{bayes_scanner}->get_msgid($mail);
+my($msgid,$msgid_hdr) = getimpl->get_msgid($mail);
 
 # $msgid is the generated hash messageid
 # $msgid_hdr is the Message-Id header
 ok($msgid eq 'ce33e4a8bc5798c65428d6018380bae346c7c126@sa_generated');
 ok($msgid_hdr eq '9PS291LhupY');
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
-ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid));
+ok(!getimpl->{store}->seen_get($msgid));
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
 ok($sa->{bayes_scanner}->learn(1, $mail));
 
 ok(!$sa->{bayes_scanner}->learn(1, $mail));
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
-ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 's');
+ok(getimpl->{store}->seen_get($msgid) eq 's');
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
 my $tokerror = 0;
 foreach my $tok (keys %{$toks}) {
-  my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok);
+  my ($spam, $ham, $atime) = getimpl->{store}->tok_get($tok);
   if ($spam == 0 || $ham > 0) {
     $tokerror = 1;
   }
 }
 ok(!$tokerror);
 
-my $tokens = $sa->{bayes_scanner}->{store}->tok_get_all(keys %{$toks});
+my $tokens = getimpl->{store}->tok_get_all(keys %{$toks});
 
 ok($tokens);
 
@@ -107,36 +111,36 @@
 }
 ok(!$tokerror);
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
 ok($sa->{bayes_scanner}->learn(0, $mail));
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
-ok($sa->{bayes_scanner}->{store}->seen_get($msgid) eq 'h');
+ok(getimpl->{store}->seen_get($msgid) eq 'h');
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
 $tokerror = 0;
 foreach my $tok (keys %{$toks}) {
-  my ($spam, $ham, $atime) = $sa->{bayes_scanner}->{store}->tok_get($tok);
+  my ($spam, $ham, $atime) = getimpl->{store}->tok_get($tok);
   if ($spam  > 0 || $ham == 0) {
     $tokerror = 1;
   }
 }
 ok(!$tokerror);
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
 ok($sa->{bayes_scanner}->forget($mail));
 
-ok($sa->{bayes_scanner}->{store}->tie_db_writable());
+ok(getimpl->{store}->tie_db_writable());
 
-ok(!$sa->{bayes_scanner}->{store}->seen_get($msgid));
+ok(!getimpl->{store}->seen_get($msgid));
 
-$sa->{bayes_scanner}->{store}->untie_db();
+getimpl->{store}->untie_db();
 
 undef $sa;
 
@@ -221,13 +225,13 @@
 
 $mail = $sa->parse( $raw_message );
 
-$body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+$body = getimpl->get_body_from_msg($mail);
 
 my $msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);
 
 ok($msgstatus);
 
-my $score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body);
+my $score = getimpl->scan($msgstatus, $mail, $body);
 
 # Pretty much we can't count on the data returned with such little training
 # so just make sure that the score wasn't equal to .5 which is the default
@@ -246,11 +250,11 @@
 
 $mail = $sa->parse( $raw_message );
 
-$body = $sa->{bayes_scanner}->get_body_from_msg($mail);
+$body = getimpl->get_body_from_msg($mail);
 
 $msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);
 
-$score = $sa->{bayes_scanner}->scan($msgstatus, $mail, $body);
+$score = getimpl->scan($msgstatus, $mail, $body);
 
 # Pretty much we can't count on the data returned with such little training
 # so just make sure that the score wasn't equal to .5 which is the default
@@ -260,7 +264,7 @@
 
 }
 
-ok($sa->{bayes_scanner}->{store}->clear_database());
+ok(getimpl->{store}->clear_database());
 
 ok(!-e 'log/user_state/bayes_journal');
 ok(!-e 'log/user_state/bayes_seen.pag');
Index: lib/Mail/SpamAssassin/OSBF/Store.pm
===================================================================
--- lib/Mail/SpamAssassin/OSBF/Store.pm	(.../trunk)	(revision 0)
+++ lib/Mail/SpamAssassin/OSBF/Store.pm	(.../branches/bug-5293-pluginized-bayes)	(revision 602913)
@@ -0,0 +1,871 @@
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at:
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+
+=head1 NAME
+
+Mail::SpamAssassin::OSBF::Store - OSBFian Storage Module
+
+=head1 DESCRIPTION
+
+This is the public API for the OSBFian store methods.  Any implementation of
+the storage module must implement these methods.
+
+=cut
+
+package Mail::SpamAssassin::OSBF::Store;
+
+use strict;
+use warnings;
+use bytes;
+use re 'taint';
+use Mail::SpamAssassin::Logger;
+
+# TODO: if we ever get tuits, it'd be good to make these POD
+# method docs more perlish... hardly a biggie.
+
+=head1 METHODS
+
+=over 4
+
+=item new
+
+public class (Mail::SpamAssassin::OSBF::Store) new (Mail::SpamAssassin::Plugin::OSBF $bayes)
+
+Description:
+This method creates a new instance of the
+Mail::SpamAssassin::OSBF::Store object.  You must pass in an instance of the
+Mail::SpamAssassin::Plugin::OSBF object, which is stashed for use throughout
+the module.
+
+=cut
+
+sub new {
+  my ($class, $osbf) = @_;
+
+  $class = ref($class) || $class;
+
+  my $self = {
+	      'osbf'                => $osbf,
+	      'supported_db_version' => 0,
+	      'db_version'	     => undef,
+	     };
+
+  bless ($self, $class);
+
+  $self;
+}
+
+=item DB_VERSION
+
+public instance (Integer) DB_VERSION ()
+
+Description:
+This method returns the currently supported database version for the
+implementation.
+
+=cut
+
+sub DB_VERSION {
+  my ($self) = @_;
+  return $self->{supported_db_version};
+}
+
+=item read_db_configs
+
+public instance () read_db_configs ()
+
+Description:
+This method reads any needed config variables from the configuration
+object and then calls the Mail::SpamAssassin::Plugin::OSBF read_db_configs method.
+
+=cut
+
+sub read_db_configs {
+  my ($self) = @_;
+
+  # TODO: at some stage, this may be useful to read config items which
+  # control database bloat, like
+  #
+  # - use of hapaxes
+  # - use of case-sensitivity
+  # - more midrange-hapax-avoidance tactics when parsing headers (future)
+  #
+  # for now, we just set these settings statically.
+  my $conf = $self->{osbf}->{main}->{conf};
+
+  # Minimum desired database size?  Expiry will not shrink the
+  # database below this number of entries.  100k entries is roughly
+  # equivalent to a 5Mb database file.
+  $self->{expiry_max_db_size} = $conf->{bayes_expiry_max_db_size};
+  $self->{expiry_pct} = $conf->{bayes_expiry_pct};
+  $self->{expiry_period} = $conf->{bayes_expiry_period};
+  $self->{expiry_max_exponent} = $conf->{bayes_expiry_max_exponent};
+}
+
+=item tie_db_readonly
+
+public instance (Boolean) tie_db_readonly ()
+
+Description:
+This method opens up the database in readonly mode.
+
+=cut
+
+sub tie_db_readonly {
+  my ($self) = @_;
+  die "osbf: tie_db_readonly: not implemented\n";
+}
+
+=item tie_db_writable
+
+public instance (Boolean) tie_db_writable ()
+
+Description:
+This method opens up the database in writable mode.
+
+Any callers of this methods should ensure that they call untie_db()
+afterwards.
+
+=cut
+
+sub tie_db_writable {
+  my ($self) = @_;
+  die "osbf: tie_db_writable: not implemented\n";
+}
+
+=item untie_db
+
+public instance () untie_db ()
+
+Description:
+This method unties the database.
+
+=cut
+
+sub untie_db {
+  my $self = shift;
+  die "osbf: untie_db: not implemented\n";
+}
+
+=item calculate_expire_delta
+
+public instance (%) calculate_expire_delta (Integer $newest_atime,
+                                             Integer $start,
+                                             Integer $max_expire_mult)
+
+Description:
+This method performs a calculation on the data to determine the optimum
+atime for token expiration.
+
+=cut
+
+sub calculate_expire_delta {
+  my ($self, $newest_atime, $start, $max_expire_mult) = @_;
+  die "osbf: calculate_expire_delta: not implemented\n";
+}
+
+=item token_expiration
+
+public instance (Integer, Integer,
+                 Integer, Integer) token_expiration(\% $opts,
+                                                    Integer $newest_atime,
+                                                    Integer $newdelta)
+
+Description:
+This method performs the database specific expiration of tokens based on
+the passed in C<$newest_atime> and C<$newdelta>.
+
+=cut
+
+sub token_expiration {
+  my ($self, $opts, $newest_atime, $newdelta) = @_;
+  die "osbf: token_expiration: not implemented\n";
+}
+
+=item expire_old_tokens
+
+public instance (Boolean) expire_old_tokens (\% hashref)
+
+Description:
+This method expires old tokens from the database.
+
+=cut
+
+sub expire_old_tokens {
+  my ($self, $opts) = @_;
+  my $ret;
+
+  my $eval_stat;
+  eval {
+    local $SIG{'__DIE__'};	# do not run user die() traps in here
+    if ($self->tie_db_writable()) {
+      $ret = $self->expire_old_tokens_trapped ($opts);
+    }
+    1;
+  } or do {
+    $eval_stat = $@ ne '' ? $@ : "errno=$!";  chomp $eval_stat;
+  };
+
+  if (!$self->{osbf}->{main}->{learn_caller_will_untie}) {
+    $self->untie_db();
+  }
+
+  if (defined $eval_stat) {	# if we died, untie the dbs.
+    warn "osbf: expire_old_tokens: $eval_stat\n";
+    return 0;
+  }
+  $ret;
+}
+
+=item expire_old_tokens_trapped
+
+public instance (Boolean) expire_old_tokens_trapped (\% $opts)
+
+Description:
+This methods does the actual token expiration.
+
+XXX More docs here about the methodology and what not
+
+=cut
+
+sub expire_old_tokens_trapped {
+  my ($self, $opts) = @_;
+
+  # Flag that we're doing work
+  $self->set_running_expire_tok();
+
+  # We don't need to do an expire, so why were we called?  Oh well.
+  if (!$self->expiry_due()) {
+    $self->remove_running_expire_tok();
+    return 0;
+  }
+
+  my $started = time();
+  my @vars = $self->get_storage_variables();
+
+  if ( $vars[10] > time ) {
+    dbg("osbf: expiry found newest atime in the future, resetting to current time");
+    $vars[10] = time;
+  }
+
+  # How many tokens do we want to keep?
+  my $goal_reduction = int($self->{expiry_max_db_size} * $self->{expiry_pct});
+  dbg("osbf: expiry check keep size, ".$self->{expiry_pct}." * max: $goal_reduction");
+  # Make sure we keep at least 100000 tokens in the DB
+  if ( $goal_reduction < 100000 ) {
+    $goal_reduction = 100000;
+    dbg("osbf: expiry keep size too small, resetting to 100,000 tokens");
+  }
+  # Now turn goal_reduction into how many to expire.
+  $goal_reduction = $vars[3] - $goal_reduction;
+  dbg("osbf: token count: ".$vars[3].", final goal reduction size: $goal_reduction");
+
+  if ( $goal_reduction < 1000 ) { # too few tokens to expire, abort.
+    dbg("osbf: reduction goal of $goal_reduction is under 1,000 tokens, skipping expire");
+    $self->set_last_expire(time());
+    $self->remove_running_expire_tok(); # this won't be cleaned up, so do it now.
+    return 1; # we want to indicate things ran as expected
+  }
+
+  # Estimate new atime delta based on the last atime delta
+  my $newdelta = 0;
+  if ( $vars[9] > 0 ) {
+    # newdelta = olddelta * old / goal;
+    # this may seem backwards, but since we're talking delta here,
+    # not actual atime, we want smaller atimes to expire more tokens,
+    # and visa versa.
+    #
+    $newdelta = int($vars[8] * $vars[9] / $goal_reduction);
+  }
+
+  # Calculate size difference between last expiration token removal
+  # count and the current goal removal count.
+  my $ratio = ($vars[9] == 0 || $vars[9] > $goal_reduction) ? $vars[9]/$goal_reduction : $goal_reduction/$vars[9];
+
+  dbg("osbf: first pass?  current: ".time().", Last: ".$vars[4].", atime: ".$vars[8].", count: ".$vars[9].", newdelta: $newdelta, ratio: $ratio, period: ".$self->{expiry_period});
+
+  ## ESTIMATION PHASE
+  #
+  # Do this for the first expire or "odd" looking results cause a first pass to determine atime:
+  #
+  # - last expire was more than 30 days ago
+  #   assume mail flow stays roughly the same month to month, recompute if it's > 1 month
+  # - last atime delta was under expiry period
+  #   if we're expiring often max_db_size should go up, but let's recompute just to check
+  # - last reduction count was < 1000 tokens
+  #   ditto
+  # - new estimated atime delta is under expiry period
+  #   ditto
+  # - difference of last reduction to current goal reduction is > 50%
+  #   if the two values are out of balance, estimating atime is going to be funky, recompute
+  #
+  if ( (time() - $vars[4] > 86400*30) || ($vars[8] < $self->{expiry_period}) || ($vars[9] < 1000)
+       || ($newdelta < $self->{expiry_period}) || ($ratio > 1.5) ) {
+    dbg("osbf: can't use estimation method for expiry, unexpected result, calculating optimal atime delta (first pass)");
+
+    my $start = $self->{expiry_period}; # exponential search starting at ...?  1/2 day, 1, 2, 4, 8, 16, ...
+    my $max_expire_mult = 2**$self->{expiry_max_exponent}; # $max_expire_mult * $start = max expire time (256 days), power of 2.
+
+    dbg("osbf: expiry max exponent: ".$self->{expiry_max_exponent});
+
+    my %delta = $self->calculate_expire_delta($vars[10], $start, $max_expire_mult);
+
+    return 0 unless (%delta);
+
+    # This will skip the for loop if debugging isn't enabled ...
+    if (would_log('dbg', 'bayes')) {
+      dbg("osbf: atime\ttoken reduction");
+      dbg("osbf: ========\t===============");
+      for(my $i = 1; $i<=$max_expire_mult; $i <<= 1) {
+	dbg("osbf: ".$start*$i."\t".(exists $delta{$i} ? $delta{$i} : 0));
+      }
+    }
+  
+    # Now figure out which max_expire_mult value gives the closest results to goal_reduction, without
+    # going over ...  Go from the largest delta backwards so the reduction size increases
+    # (tokens that expire at 4 also expire at 3, 2, and 1, so 1 will always be the largest expiry...)
+    #
+    for( ; $max_expire_mult > 0; $max_expire_mult>>=1 ) {
+      next unless exists $delta{$max_expire_mult};
+      if ($delta{$max_expire_mult} > $goal_reduction) {
+        $max_expire_mult<<=1; # the max expire is actually the next power of 2 out
+        last;
+      }
+    }
+
+    # if max_expire_mult gets to 0, either we can't expire anything, or 1 is <= $goal_reduction
+    $max_expire_mult ||= 1;
+
+    # $max_expire_mult is now equal to the value we should use ...
+    # Check to see if the atime value we found is really good.
+    # It's not good if:
+    # - $max_expire_mult would not expire any tokens.  This means that the majority of
+    #   tokens are old or new, and more activity is required before an expiry can occur.
+    # - reduction count < 1000, not enough tokens to be worth doing an expire.
+    #
+    if ( !exists $delta{$max_expire_mult} || $delta{$max_expire_mult} < 1000 ) {
+      dbg("osbf: couldn't find a good delta atime, need more token difference, skipping expire");
+      $self->set_last_expire(time());
+      $self->remove_running_expire_tok(); # this won't be cleaned up, so do it now.
+      return 1; # we want to indicate things ran as expected
+    }
+
+    $newdelta = $start * $max_expire_mult;
+    dbg("osbf: first pass decided on $newdelta for atime delta");
+  }
+  else { # use the estimation method
+    dbg("osbf: can do estimation method for expiry, skipping first pass");
+  }
+
+  my ($kept, $deleted, $num_hapaxes, $num_lowfreq) = $self->token_expiration($opts, $newdelta, @vars);
+
+  my $done = time();
+
+  my $msg = "expired old bayes database entries in ".($done - $started)." seconds";
+  my $msg2 = "$kept entries kept, $deleted deleted";
+
+  if ($opts->{verbose}) {
+    my $hapax_pc = ($num_hapaxes * 100) / $kept;
+    my $lowfreq_pc = ($num_lowfreq * 100) / $kept;
+    print "$msg\n$msg2\n";
+    printf "token frequency: 1-occurrence tokens: %3.2f%%\n", $hapax_pc;
+    printf "token frequency: less than 8 occurrences: %3.2f%%\n", $lowfreq_pc;
+  }
+  else {
+    dbg("osbf: $msg: $msg2");
+  }
+
+  return 1;
+}
+
+=item sync_due
+
+public instance (Boolean) sync_due ()
+
+Description:
+This methods determines if a sync is due.
+
+=cut
+
+sub sync_due {
+  my ($self) = @_;
+  die "osbf: sync_due: not implemented\n";
+}
+
+=item expiry_due
+
+public instance (Boolean) expiry_due ()
+
+Description:
+This methods determines if an expire is due.
+
+=cut
+
+sub expiry_due {
+  my ($self) = @_;
+
+  $self->read_db_configs();	# make sure this has happened here
+
+  # If force expire was called, do the expire no matter what.
+  return 1 if ($self->{osbf}->{main}->{learn_force_expire});
+
+  # if config says not to auto expire then no need to continue
+  return 0 if ($self->{osbf}->{main}->{conf}->{bayes_auto_expire} == 0);
+
+  # is the database too small for expiry?  (Do *not* use "scalar keys",
+  # as this will iterate through the entire db counting them!)
+  my @vars = $self->get_storage_variables();
+  my $ntoks = $vars[3];
+
+  my $last_expire = time() - $vars[4];
+  if (!$self->{osbf}->{main}->{ignore_safety_expire_timeout}) {
+    # if we're not ignoring the safety timeout, don't run an expire more
+    # than once every 12 hours.
+    return 0 if ($last_expire < 43200);
+  }
+  else {
+    # if we are ignoring the safety timeout (e.g.: mass-check), still
+    # limit the expiry to only one every 5 minutes.
+    return 0 if ($last_expire < 300);
+  }
+
+  dbg("osbf: DB expiry: tokens in DB: $ntoks, Expiry max size: ".$self->{expiry_max_db_size}.", Oldest atime: ".$vars[5].", Newest atime: ".$vars[10].", Last expire: ".$vars[4].", Current time: ".time(),'bayes','-1');
+
+  my $conf = $self->{osbf}->{main}->{conf};
+  if ($ntoks <= 100000 ||			# keep at least 100k tokens
+      $self->{expiry_max_db_size} > $ntoks ||	# not enough tokens to cause an expire
+      $vars[10]-$vars[5] < 43200 ||		# delta between oldest and newest < 12h
+      $self->{db_version} < $self->DB_VERSION # ignore old db formats
+      ) {
+    return 0;
+  }
+
+  return 1;
+}
+
+=item seen_get
+
+public instance (Char) seen_get (String $msgid)
+
+Description:
+This method retrieves the stored value, if any, for C<$msgid>.  The return
+value is the stored string ('s' for spam and 'h' for ham) or undef if
+C<$msgid> is not found.  Plus additional values (TODO)
+
+=cut
+
+sub seen_get {
+  my ($self, $msgid) = @_;
+  die "osbf: seen_get: not implemented\n";
+}
+
+=item seen_put
+
+public instance (Boolean) seen_put (String $msgid, Char $flag)
+
+Description:
+This method records C<$msgid> as the type given by C<$flag>.  C<$flag> is
+one of two values 's' for spam and 'h' for ham, plus additional
+values (TODO).
+
+=cut
+
+sub seen_put {
+  my ($self, $msgid, $flag) = @_;
+  die "osbf: seen_put: not implemented\n";
+}
+
+=item seen_delete
+
+public instance (Boolean) seen_delete (String $msgid)
+
+Description:
+This method removes C<$msgid> from storage.
+
+=cut
+
+sub seen_delete {
+  my ($self, $msgid) = @_;
+  die "osbf: seen_delete: not implemented\n";
+}
+
+=item get_storage_variables
+
+public instance (@) get_storage_variables ()
+
+Description:
+This method retrieves the various administrative variables used by
+the OSBF storage implementation.
+
+The values returned in the array are in the following order:
+
+0: scan count base
+
+1: number of spam
+
+2: number of ham
+
+3: number of tokens in db
+
+4: last expire atime
+
+5: oldest token in db atime
+
+6: db version value
+
+7: last journal sync
+
+8: last atime delta
+
+9: last expire reduction count
+
+10: newest token in db atime
+
+=cut
+
+sub get_storage_variables {
+  my ($self) = @_;
+  die "osbf: get_storage_variables: not implemented\n";
+}
+
+=item dump_db_toks
+
+public instance () dump_db_toks (String $template, String $regex, @ @vars)
+
+Description:
+This method loops over all tokens, computing the probability for the token
+and then printing it out according to the passed in template.
+
+=cut
+
+sub dump_db_toks {
+  my ($self, $template, $regex, @vars) = @_;
+  die "osbf: dump_db_toks: not implemented\n";
+}
+
+=item set_last_expire
+
+public instance (Boolean) _set_last_expire (Integer $time)
+
+Description:
+This method sets the last expire time.
+
+=cut
+
+sub set_last_expire {
+  my ($self, $time) = @_;
+  die "osbf: set_last_expire: not implemented\n";
+}
+
+=item get_running_expire_tok
+
+public instance (Time) get_running_expire_tok ()
+
+Description:
+This method determines if an expire is currently running and returns the time
+the expire started.
+
+=cut
+
+sub get_running_expire_tok {
+  my ($self) = @_;
+  die "osbf: get_running_expire_tok: not implemented\n";
+}
+
+=item set_running_expire_tok
+
+public instance (Time) set_running_expire_tok ()
+
+Description:
+This method sets the running expire time to the current time.
+
+=cut
+
+sub set_running_expire_tok {
+  my ($self) = @_;
+  die "osbf: set_running_expire_tok: not implemented\n";
+}
+
+=item remove_running_expire_tok
+
+public instance (Boolean) remove_running_expire_tok ()
+
+Description:
+This method removes a currently set running expire time.
+
+=cut
+
+sub remove_running_expire_tok {
+  my ($self) = @_;
+  die "osbf: remove_running_expire_tok: not implemented\n";
+}
+
+=item tok_get
+
+public instance (Integer, Integer, Time) tok_get (String $token)
+
+Description:
+This method retrieves the specified token (C<$token>) from storage and returns
+it's spam count, ham acount and last access time.
+
+=cut
+
+sub tok_get {
+  my ($self, $token) = @_;
+  die "osbf: tok_get: not implemented\n";
+}
+
+=item tok_get_all
+
+public instance (\@) tok_get_all (@ @tokens)
+
+Description:
+This method retrieves the specified tokens (C<@tokens>) from storage and
+returns an array ref of arrays spam count, ham count and last access time.
+
+=cut
+
+sub tok_get_all {
+  my ($self, $tokens) = @_;
+  die "osbf: tok_get_all: not implemented\n";
+}
+
+=item multi_tok_value_change
+
+public instance (Boolean) multi_tok_value_change (\% $weights,
+					          String $atime)
+
+Description:
+This method takes a C<$weights> hash ref and writes the associated
+[ spam, ham ] value pairs to the store, along with updating each tokens
+atime with C<$atime>.
+
+=cut
+
+sub multi_tok_value_change {
+  my ($self, $weights, $atime) = @_;
+  die "osbf: multi_tok_value_change: not implemented\n";
+}
+
+=item nspam_nham_get
+
+public instance (Integer, Integer) nspam_nham_get ()
+
+Description:
+This method retrieves the total number of spam and the total number of ham
+currently under storage.
+
+=cut
+
+sub nspam_nham_get {
+  my ($self) = @_;
+  die "osbf: nspam_nham_get: not implemented\n";
+}
+
+=item nspam_nham_change
+
+public instance (Boolean) nspam_nham_change (Integer $num_spam,
+                                             Integer $num_ham)
+
+Description:
+This method updates the number of spam and the number of ham in the database.
+
+=cut
+
+sub nspam_nham_change {
+  my ($self, $num_spam, $num_ham) = @_;
+  die "osbf: nspam_nham_change: not implemented\n";
+}
+
+=item tok_touch
+
+public instance (Boolean) tok_touch (String $token,
+                                     Time $atime)
+
+Description:
+This method updates the given tokens (C<$token>) access time.
+
+=cut
+
+sub tok_touch {
+  my ($self, $token, $atime) = @_;
+  die "osbf: tok_touch: not implemanted\n";
+}
+
+=item tok_touch_all
+
+public instance (Boolean) tok_touch_all (\@ $tokens,
+                                         Time $atime)
+
+Description:
+This method does a mass update of the given list of tokens C<$tokens>, if the existing token
+atime is < C<$atime>.
+
+=cut
+
+sub tok_touch_all {
+  my ($self, $tokens, $atime) = @_;
+  die "osbf: tok_touch_all: not implemanted\n";
+}
+
+=item cleanup
+
+public instance (Boolean) cleanup ()
+
+Description:
+This method performs any cleanup necessary before moving onto the next
+operation.
+
+=cut
+
+sub cleanup {
+  my ($self) = @_;
+  die "osbf: cleanup: not implemented\n";
+}
+
+=item get_magic_re
+
+public instance get_magic_re (String)
+
+Description:
+This method returns a regexp which indicates a magic token.
+
+=cut
+
+sub get_magic_re {
+  my ($self) = @_;
+  die "osbf: get_magic_re: not implemented\n";
+}
+
+=item sync
+
+public instance (Boolean) sync (\% $opts)
+
+Description:
+This method performs a sync of the database.
+
+=cut
+
+sub sync {
+  my ($self, $opts) = @_;
+  die "osbf: sync: not implemented\n";
+}
+
+=item perform_upgrade
+
+public instance (Boolean) perform_upgrade (\% $opts)
+
+Description:
+This method is a utility method that performs any necessary upgrades
+between versions.  It should know how to handle previous versions and
+what needs to happen to upgrade them.
+
+A true return value indicates success.
+
+=cut
+
+sub perform_upgrade {
+  my ($self, $opts) = @_;
+  die "osbf: perform_upgrade: not implemented\n";
+}
+
+=item clear_database
+
+public instance (Boolean) clear_database ()
+
+Description:
+This method deletes all records for a particular user.
+
+Callers should be aware that any errors returned by this method
+could causes the database to be inconsistent for the given user.
+
+=cut
+
+sub clear_database {
+  my ($self) = @_;
+  die "osbf: clear_database: not implemented\n";
+}
+
+=item backup_database
+
+public instance (Boolean) backup_database ()
+
+Description:
+This method will dump the users database in a machine readable format.
+
+=cut
+
+sub backup_database {
+  my ($self) = @_;
+  die "osbf: backup_database: not implemented\n";
+}
+
+=item restore_database
+
+public instance (Boolean) restore_database (String $filename, Boolean $showdots)
+
+Description:
+This method restores a database from the given filename, C<$filename>.
+
+Callers should be aware that any errors returned by this method
+could causes the database to be inconsistent for the given user.
+
+=cut
+
+sub restore_database {
+  my ($self, $filename, $showdots) = @_;
+  die "osbf: restore_database: not implemented\n";
+}
+
+=item db_readable
+
+public instance (Boolean) db_readable ()
+
+Description:
+This method returns whether or not the OSBF DB is available in a
+readable state.
+
+=cut
+
+sub db_readable {
+  my ($self) = @_;
+  die "osbf: db_readable: not implemented\n";
+}
+
+=item db_writable
+
+public instance (Boolean) db_writable ()
+
+Description:
+This method returns whether or not the OSBF DB is available in a
+writable state.
+
+=cut
+
+sub db_writable {
+  my ($self) = @_;
+  die "osbf: db_writable: not implemented\n";
+}
+
+
+sub sa_die { Mail::SpamAssassin::sa_die(@_); }
+
+1;
+
+=back
+
+=cut
Index: lib/Mail/SpamAssassin/OSBF/Store/DBM.pm
===================================================================
--- lib/Mail/SpamAssassin/OSBF/Store/DBM.pm	(.../trunk)	(revision 0)
+++ lib/Mail/SpamAssassin/OSBF/Store/DBM.pm	(.../branches/bug-5293-pluginized-bayes)	(revision 602913)
@@ -0,0 +1,1906 @@
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at:
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+
+package Mail::SpamAssassin::OSBF::Store::DBM;
+
+use strict;
+use warnings;
+use bytes;
+use Fcntl;
+use re 'taint';
+
+use Mail::SpamAssassin;
+use Mail::SpamAssassin::Util qw(untaint_var);
+use Mail::SpamAssassin::OSBF::Store;
+use Mail::SpamAssassin::Logger;
+use Digest::SHA1 qw(sha1);
+use File::Basename;
+use File::Spec;
+use File::Path;
+
+use constant MAGIC_RE    => qr/^\015\001\007\011\003/;
+
+use vars qw{
+  @ISA
+  @DBNAMES
+  $NSPAM_MAGIC_TOKEN $NHAM_MAGIC_TOKEN $LAST_EXPIRE_MAGIC_TOKEN $LAST_JOURNAL_SYNC_MAGIC_TOKEN
+  $NTOKENS_MAGIC_TOKEN $OLDEST_TOKEN_AGE_MAGIC_TOKEN $LAST_EXPIRE_REDUCE_MAGIC_TOKEN
+  $RUNNING_EXPIRE_MAGIC_TOKEN $DB_VERSION_MAGIC_TOKEN $LAST_ATIME_DELTA_MAGIC_TOKEN
+  $NEWEST_TOKEN_AGE_MAGIC_TOKEN
+};
+
+@ISA = qw( Mail::SpamAssassin::OSBF::Store );
+
+@DBNAMES = qw(toks seen);
+
+# These are the magic tokens we use to track stuff in the DB.
+# The format is '^M^A^G^I^C' followed by any string you want.
+# None of the control chars will be in a real token.
+$DB_VERSION_MAGIC_TOKEN		= "\015\001\007\011\003DBVERSION";
+$LAST_ATIME_DELTA_MAGIC_TOKEN	= "\015\001\007\011\003LASTATIMEDELTA";
+$LAST_EXPIRE_MAGIC_TOKEN	= "\015\001\007\011\003LASTEXPIRE";
+$LAST_EXPIRE_REDUCE_MAGIC_TOKEN	= "\015\001\007\011\003LASTEXPIREREDUCE";
+$LAST_JOURNAL_SYNC_MAGIC_TOKEN	= "\015\001\007\011\003LASTJOURNALSYNC";
+$NEWEST_TOKEN_AGE_MAGIC_TOKEN	= "\015\001\007\011\003NEWESTAGE";
+$NHAM_MAGIC_TOKEN		= "\015\001\007\011\003NHAM";
+$NSPAM_MAGIC_TOKEN		= "\015\001\007\011\003NSPAM";
+$NTOKENS_MAGIC_TOKEN		= "\015\001\007\011\003NTOKENS";
+$OLDEST_TOKEN_AGE_MAGIC_TOKEN	= "\015\001\007\011\003OLDESTAGE";
+$RUNNING_EXPIRE_MAGIC_TOKEN	= "\015\001\007\011\003RUNNINGEXPIRE";
+
+sub HAS_DBM_MODULE {
+  my ($self) = @_;
+  if (exists($self->{has_dbm_module})) {
+    return $self->{has_dbm_module};
+  }
+  $self->{has_dbm_module} = eval { require DB_File; };
+}
+
+sub DBM_MODULE {
+  return "DB_File";
+}
+
+# Possible file extensions used by the kinds of database files DB_File
+# might create.  We need these so we can create a new file and rename
+# it into place.
+sub DB_EXTENSIONS {
+  return ('', '.db');
+}
+
+###########################################################################
+
+sub new {
+  my $class = shift;
+  $class = ref($class) || $class;
+
+  my $self = $class->SUPER::new(@_);
+
+  $self->{supported_db_version} = 3;
+
+  $self->{already_tied} = 0;
+  $self->{is_locked} = 0;
+  $self->{string_to_journal} = '';
+  $self->{last_refresh} = 0;
+
+  $self;
+}
+
+###########################################################################
+
+sub tie_db_readonly {
+  my ($self) = @_;
+
+  if (!$self->HAS_DBM_MODULE) {
+    dbg("osbf: " . $self->DBM_MODULE . " module not installed, cannot use OSBF");
+    return 0;
+  }
+
+  # return if we've already tied to the db's, using the same mode
+  # (locked/unlocked) as before.
+  return 1 if ($self->{already_tied} && $self->{is_locked} == 0);
+
+  my $main = $self->{osbf}->{main};
+  if (!defined($main->{conf}->{osbf_path})) {
+    dbg("osbf: osbf_path not defined");
+    return 0;
+  }
+
+  $self->read_db_configs();
+
+  my $path = $main->sed_path($main->{conf}->{osbf_path});
+
+  my $found = 0;
+  for my $ext ($self->DB_EXTENSIONS) {
+    if (-f $path.'_toks'.$ext) {
+      $found = 1;
+      last;
+    }
+  }
+
+  if (!$found) {
+    dbg("osbf: no dbs present, cannot tie DB R/O: ${path}_toks");
+    return 0;
+  }
+
+  foreach my $dbname (@DBNAMES) {
+    my $name = $path.'_'.$dbname;
+    my $db_var = 'db_'.$dbname;
+    dbg("osbf: tie-ing to DB file R/O $name");
+
+    # untie %{$self->{$db_var}} if (tied %{$self->{$db_var}});
+    if (!tie %{$self->{$db_var}},$self->DBM_MODULE, $name, O_RDONLY,
+		 (oct($main->{conf}->{osbf_file_mode}) & 0666))
+    {
+      # bug 2975: it's acceptable for the db_seen to not be present,
+      # to allow it to be recycled.  if that's the case, just create
+      # a new, empty one. we don't need to lock it, since we won't
+      # be writing to it; let the R/W api deal with that case.
+
+      if ($dbname eq 'seen') {
+        tie %{$self->{$db_var}},$self->DBM_MODULE, $name, O_RDWR|O_CREAT,
+                    (oct($main->{conf}->{osbf_file_mode}) & 0666)
+          or goto failed_to_tie;
+      }
+      else {
+        goto failed_to_tie;
+      }
+    }
+  }
+
+  $self->{db_version} = ($self->get_storage_variables())[6];
+  dbg("osbf: found bayes db version ".$self->{db_version});
+
+  # If the DB version is one we don't understand, abort!
+  if ($self->_check_db_version() != 0) {
+    warn("osbf: bayes db version ".$self->{db_version}." is not able to be used, aborting!");
+    $self->untie_db();
+    return 0;
+  }
+
+  $self->{already_tied} = 1;
+  return 1;
+
+failed_to_tie:
+  warn "osbf: cannot open bayes databases ${path}_* R/O: tie failed: $!\n";
+  foreach my $dbname (@DBNAMES) {
+    my $db_var = 'db_'.$dbname;
+    next unless exists $self->{$db_var};
+    dbg("osbf: untie-ing DB file $dbname");
+    untie %{$self->{$db_var}};
+  }
+
+  return 0;
+}
+
+# tie() to the databases, read-write and locked.  Any callers of
+# this should ensure they call untie_db() afterwards!
+#
+sub tie_db_writable {
+  my ($self) = @_;
+
+  if (!$self->HAS_DBM_MODULE) {
+    dbg("osbf: " . $self->DBM_MODULE . " module not installed, cannot use bayes");
+    return 0;
+  }
+
+  # Useful shortcut ...
+  my $main = $self->{osbf}->{main};
+
+  # if we've already tied the db's using the same mode
+  # (locked/unlocked) as we want now, freshen the lock and return.
+  if ($self->{already_tied} && $self->{is_locked} == 1) {
+
+    # don't refresh too frequently; once every 15 secs should be plenty
+    my $now = time;
+    if (!$self->{last_refresh} || ($now - $self->{last_refresh} > 15)) {
+      $self->{last_refresh} = $now;
+      $main->{locker}->refresh_lock($self->{locked_file});
+    }
+
+    return 1;
+  }
+
+  if (!defined($main->{conf}->{osbf_path})) {
+    dbg("osbf: osbf_path not defined");
+    return 0;
+  }
+
+  $self->read_db_configs();
+
+  my $path = $main->sed_path($main->{conf}->{osbf_path});
+
+  my $found = 0;
+  for my $ext ($self->DB_EXTENSIONS) {
+    if (-f $path.'_toks'.$ext) {
+      $found = 1;
+      last;
+    }
+  }
+
+  my $parentdir = dirname($path);
+  if (!-d $parentdir) {
+    # run in an eval(); if mkpath has no perms, it calls die()
+    eval {
+      mkpath($parentdir, 0, (oct($main->{conf}->{osbf_file_mode}) & 0777));
+    };
+  }
+
+  my $tout;
+  if ($main->{learn_wait_for_lock}) {
+    $tout = 300;       # TODO: Dan to write better lock code
+  } else {
+    $tout = 10;
+  }
+  if ($main->{locker}->safe_lock($path, $tout, $main->{conf}->{osbf_file_mode}))
+  {
+    $self->{locked_file} = $path;
+    $self->{is_locked} = 1;
+  } else {
+    warn "osbf: cannot open bayes databases ${path}_* R/W: lock failed: $!\n";
+    return 0;
+  }
+
+  my $umask = umask 0;
+  foreach my $dbname (@DBNAMES) {
+    my $name = $path.'_'.$dbname;
+    my $db_var = 'db_'.$dbname;
+    dbg("osbf: tie-ing to DB file R/W $name");
+
+    ($self->DBM_MODULE eq 'DB_File') and
+         Mail::SpamAssassin::Util::avoid_db_file_locking_bug ($name);
+
+    tie %{$self->{$db_var}},$self->DBM_MODULE, $name, O_RDWR|O_CREAT,
+		 (oct($main->{conf}->{osbf_file_mode}) & 0666)
+       or goto failed_to_tie;
+  }
+  umask $umask;
+
+  # set our cache to what version DB we're using
+  $self->{db_version} = ($self->get_storage_variables())[6];
+  # don't bother printing this unless found since it would be bogus anyway
+  dbg("osbf: found bayes db version ".$self->{db_version}) if ($found);
+
+  # figure out if we can read the current DB and if we need to do a
+  # DB version update and do it if necessary if either has a problem,
+  # fail immediately
+  #
+  if ($found && !$self->_upgrade_db()) {
+    $self->untie_db();
+    return 0;
+  }
+  elsif (!$found) { # new DB, make sure we know that ...
+    $self->{db_version} = $self->{db_toks}->{$DB_VERSION_MAGIC_TOKEN} = $self->DB_VERSION;
+    $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN} = 0; # no tokens in the db ...
+    dbg("osbf: new db, set db version ".$self->{db_version}." and 0 tokens");
+  }
+
+  $self->{already_tied} = 1;
+  return 1;
+
+failed_to_tie:
+  my $err = $!;
+  umask $umask;
+
+  foreach my $dbname (@DBNAMES) {
+    my $db_var = 'db_'.$dbname;
+    next unless exists $self->{$db_var};
+    dbg("osbf: untie-ing DB file $dbname");
+    untie %{$self->{$db_var}};
+  }       
+
+  if ($self->{is_locked}) {
+    $self->{osbf}->{main}->{locker}->safe_unlock($self->{locked_file});
+    $self->{is_locked} = 0;
+  }
+  warn "osbf: cannot open bayes databases ${path}_* R/W: tie failed: $err\n";
+  return 0;
+}
+
+# Do we understand how to deal with this DB version?
+sub _check_db_version {
+  my ($self) = @_;
+
+  # return -1 if older, 0 if current, 1 if newer
+  return $self->{db_version} <=> $self->DB_VERSION;
+}
+
+# Check to see if we need to upgrade the DB, and do so if necessary
+sub _upgrade_db {
+  my ($self) = @_;
+
+  my $verschk = $self->_check_db_version();
+  my $res = 0; # used later on for tie() checks
+  my $umask; # used later for umask modifications
+
+  # If the DB is the latest version, no problem.
+  return 1 if ($verschk == 0);
+
+  # If the DB is a newer version that we know what to do with ... abort!
+  if ($verschk == 1) {
+    warn("osbf: bayes db version ".$self->{db_version}." is newer than we understand, aborting!");
+    return 0;
+  }
+
+  # If the current DB version is lower than the new version, upgrade!
+  # Do conversions in order so we can go 1 -> 3, make sure to update
+  #   $self->{db_version} along the way
+
+  dbg("osbf: detected bayes db format ".$self->{db_version}.", upgrading");
+
+  # since DB_File will not shrink a database (!!), we need to *create*
+  # a new one instead.
+  my $main = $self->{osbf}->{main};
+  my $path = $main->sed_path($main->{conf}->{osbf_path});
+  my $name = $path.'_toks';
+
+  # older version's journal files are likely not in the same format as the new ones, so remove it.
+  my $jpath = $self->_get_journal_filename();
+  if (-f $jpath) {
+    dbg("osbf: old journal file found, removing");
+    warn "osbf: couldn't remove $jpath: $!" if (!unlink $jpath);
+  }
+
+  if ($self->{db_version} < 2) {
+    dbg("osbf: upgrading database format from v".$self->{db_version}." to v2");
+    $self->set_running_expire_tok();
+
+    my ($DB_NSPAM_MAGIC_TOKEN, $DB_NHAM_MAGIC_TOKEN, $DB_NTOKENS_MAGIC_TOKEN);
+    my ($DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN, $DB_LAST_EXPIRE_MAGIC_TOKEN);
+
+    # Magic tokens for version 0, defined as '**[A-Z]+'
+    if ($self->{db_version} == 0) {
+      $DB_NSPAM_MAGIC_TOKEN			= '**NSPAM';
+      $DB_NHAM_MAGIC_TOKEN			= '**NHAM';
+      $DB_NTOKENS_MAGIC_TOKEN			= '**NTOKENS';
+      #$DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN		= '**OLDESTAGE';
+      #$DB_LAST_EXPIRE_MAGIC_TOKEN		= '**LASTEXPIRE';
+      #$DB_SCANCOUNT_BASE_MAGIC_TOKEN		= '**SCANBASE';
+      #$DB_RUNNING_EXPIRE_MAGIC_TOKEN		= '**RUNNINGEXPIRE';
+    }
+    else {
+      $DB_NSPAM_MAGIC_TOKEN			= "\015\001\007\011\003NSPAM";
+      $DB_NHAM_MAGIC_TOKEN			= "\015\001\007\011\003NHAM";
+      $DB_NTOKENS_MAGIC_TOKEN			= "\015\001\007\011\003NTOKENS";
+      #$DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN		= "\015\001\007\011\003OLDESTAGE";
+      #$DB_LAST_EXPIRE_MAGIC_TOKEN		= "\015\001\007\011\003LASTEXPIRE";
+      #$DB_SCANCOUNT_BASE_MAGIC_TOKEN		= "\015\001\007\011\003SCANBASE";
+      #$DB_RUNNING_EXPIRE_MAGIC_TOKEN		= "\015\001\007\011\003RUNNINGEXPIRE";
+    }
+
+    # remember when we started ...
+    my $started = time;
+    my $newatime = $started;
+
+    # use O_EXCL to avoid races (bonus paranoia, since we should be locked
+    # anyway)
+    my %new_toks;
+    $umask = umask 0;
+    $res = tie %new_toks, $self->DBM_MODULE, "${name}.new", O_RDWR|O_CREAT|O_EXCL,
+          (oct($main->{conf}->{osbf_file_mode}) & 0666);
+    umask $umask;
+    return 0 unless $res;
+    undef $res;
+
+    # add the magic tokens to the new db.
+    $new_toks{$NSPAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NSPAM_MAGIC_TOKEN};
+    $new_toks{$NHAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NHAM_MAGIC_TOKEN};
+    $new_toks{$NTOKENS_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NTOKENS_MAGIC_TOKEN};
+    $new_toks{$DB_VERSION_MAGIC_TOKEN} = 2; # we're now a DB version 2 file
+    $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $newatime;
+    $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = $newatime;
+    $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $newatime;
+    $new_toks{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = $newatime;
+    $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = 0;
+    $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = 0;
+
+    # deal with the data tokens
+    my ($tok, $packed);
+    my $count = 0;
+    while (($tok, $packed) = each %{$self->{db_toks}}) {
+      next if ($tok =~ /^(?:\*\*[A-Z]+$|\015\001\007\011\003)/); # skip magic tokens
+
+      my ($ts, $th, $atime) = $self->tok_unpack($packed);
+      $new_toks{$tok} = $self->tok_pack($ts, $th, $newatime);
+
+      # Refresh the lock every so often...
+      if (($count++ % 1000) == 0) {
+        $self->set_running_expire_tok();
+      }
+    }
+
+
+    # now untie so we can do renames
+    untie %{$self->{db_toks}};
+    untie %new_toks;
+
+    # This is the critical phase (moving files around), so don't allow
+    # it to be interrupted.
+    local $SIG{'INT'} = 'IGNORE';
+    local $SIG{'TERM'} = 'IGNORE';
+    local $SIG{'HUP'} = 'IGNORE' if (!Mail::SpamAssassin::Util::am_running_on_windows());
+
+    # older versions used scancount, so kill the stupid little file ...
+    my $msgc = $path.'_msgcount';
+    if (-f $msgc) {
+      dbg("osbf: old msgcount file found, removing");
+      if (!unlink $msgc) {
+        warn "osbf: couldn't remove $msgc: $!";
+      }
+    }
+
+    # now rename in the new one.  Try several extensions
+    for my $ext ($self->DB_EXTENSIONS) {
+      my $newf = $name.'.new'.$ext;
+      my $oldf = $name.$ext;
+      next unless (-f $newf);
+      if (!rename ($newf, $oldf)) {
+        warn "osbf: rename $newf to $oldf failed: $!\n";
+        return 0;
+      }
+    }
+
+    # re-tie to the new db in read-write mode ...
+    $umask = umask 0;
+    $res = tie %{$self->{db_toks}},$self->DBM_MODULE, $name, O_RDWR|O_CREAT,
+	 (oct($main->{conf}->{osbf_file_mode}) & 0666);
+    umask $umask;
+    return 0 unless $res;
+    undef $res;
+
+    dbg("osbf: upgraded database format from v".$self->{db_version}." to v2 in ".(time - $started)." seconds");
+    $self->{db_version} = 2; # need this for other functions which check
+  }
+
+  # Version 3 of the database converts all existing tokens to SHA1 hashes
+  if ($self->{db_version} == 2) {
+    dbg("osbf: upgrading database format from v".$self->{db_version}." to v3");
+    $self->set_running_expire_tok();
+
+    my $DB_NSPAM_MAGIC_TOKEN		  = "\015\001\007\011\003NSPAM";
+    my $DB_NHAM_MAGIC_TOKEN		  = "\015\001\007\011\003NHAM";
+    my $DB_NTOKENS_MAGIC_TOKEN		  = "\015\001\007\011\003NTOKENS";
+    my $DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN	  = "\015\001\007\011\003OLDESTAGE";
+    my $DB_LAST_EXPIRE_MAGIC_TOKEN	  = "\015\001\007\011\003LASTEXPIRE";
+    my $DB_NEWEST_TOKEN_AGE_MAGIC_TOKEN	  = "\015\001\007\011\003NEWESTAGE";
+    my $DB_LAST_JOURNAL_SYNC_MAGIC_TOKEN  = "\015\001\007\011\003LASTJOURNALSYNC";
+    my $DB_LAST_ATIME_DELTA_MAGIC_TOKEN	  = "\015\001\007\011\003LASTATIMEDELTA";
+    my $DB_LAST_EXPIRE_REDUCE_MAGIC_TOKEN = "\015\001\007\011\003LASTEXPIREREDUCE";
+
+    # remember when we started ...
+    my $started = time;
+
+    # use O_EXCL to avoid races (bonus paranoia, since we should be locked
+    # anyway)
+    my %new_toks;
+    $umask = umask 0;
+    $res = tie %new_toks, $self->DBM_MODULE, "${name}.new", O_RDWR|O_CREAT|O_EXCL,
+          (oct($main->{conf}->{osbf_file_mode}) & 0666);
+    umask $umask;
+    return 0 unless $res;
+    undef $res;
+
+    # add the magic tokens to the new db.
+    $new_toks{$NSPAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NSPAM_MAGIC_TOKEN};
+    $new_toks{$NHAM_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NHAM_MAGIC_TOKEN};
+    $new_toks{$NTOKENS_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NTOKENS_MAGIC_TOKEN};
+    $new_toks{$DB_VERSION_MAGIC_TOKEN} = 3; # we're now a DB version 3 file
+    $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $self->{db_toks}->{$DB_OLDEST_TOKEN_AGE_MAGIC_TOKEN};
+    $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = $self->{db_toks}->{$DB_LAST_EXPIRE_MAGIC_TOKEN};
+    $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $self->{db_toks}->{$DB_NEWEST_TOKEN_AGE_MAGIC_TOKEN};
+    $new_toks{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = $self->{db_toks}->{$DB_LAST_JOURNAL_SYNC_MAGIC_TOKEN};
+    $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = $self->{db_toks}->{$DB_LAST_ATIME_DELTA_MAGIC_TOKEN};
+    $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} =$self->{db_toks}->{$DB_LAST_EXPIRE_REDUCE_MAGIC_TOKEN};
+
+    # deal with the data tokens
+    my $count = 0;
+    while (my ($tok, $packed) = each %{$self->{db_toks}}) {
+      next if ($tok =~ /^\015\001\007\011\003/); # skip magic tokens
+      my $tok_hash = substr(sha1($tok), -5);
+      $new_toks{$tok_hash} = $packed;
+
+      # Refresh the lock every so often...
+      if (($count++ % 1000) == 0) {
+        $self->set_running_expire_tok();
+      }
+    }
+
+    # now untie so we can do renames
+    untie %{$self->{db_toks}};
+    untie %new_toks;
+
+    # This is the critical phase (moving files around), so don't allow
+    # it to be interrupted.
+    local $SIG{'INT'} = 'IGNORE';
+    local $SIG{'TERM'} = 'IGNORE';
+    local $SIG{'HUP'} = 'IGNORE' if (!Mail::SpamAssassin::Util::am_running_on_windows());
+
+    # now rename in the new one.  Try several extensions
+    for my $ext ($self->DB_EXTENSIONS) {
+      my $newf = $name.'.new'.$ext;
+      my $oldf = $name.$ext;
+      next unless (-f $newf);
+      if (!rename($newf, $oldf)) {
+        warn "osbf: rename $newf to $oldf failed: $!\n";
+        return 0;
+      }
+    }
+
+    # re-tie to the new db in read-write mode ...
+    $umask = umask 0;
+    $res = tie %{$self->{db_toks}},$self->DBM_MODULE, $name, O_RDWR|O_CREAT,
+	 (oct ($main->{conf}->{osbf_file_mode}) & 0666);
+    umask $umask;
+    return 0 unless $res;
+    undef $res;
+
+    dbg("osbf: upgraded database format from v".$self->{db_version}." to v3 in ".(time - $started)." seconds");
+
+    $self->{db_version} = 3; # need this for other functions which check
+  }
+
+  # if ($self->{db_version} == 3) {
+  #   ...
+  #   $self->{db_version} = 4; # need this for other functions which check
+  # }
+  # ... and so on.
+
+  return 1;
+}
+
+###########################################################################
+
+sub untie_db {
+  my $self = shift;
+
+  return if (!$self->{already_tied});
+
+  dbg("osbf: untie-ing");
+
+  foreach my $dbname (@DBNAMES) {
+    my $db_var = 'db_'.$dbname;
+
+    if (exists $self->{$db_var}) {
+      # dbg("osbf: untie-ing $db_var");
+      untie %{$self->{$db_var}};
+      delete $self->{$db_var};
+    }
+  }
+
+  if ($self->{is_locked}) {
+    dbg("osbf: files locked, now unlocking lock");
+    $self->{osbf}->{main}->{locker}->safe_unlock ($self->{locked_file});
+    $self->{is_locked} = 0;
+  }
+
+  $self->{already_tied} = 0;
+  $self->{db_version} = undef;
+}
+
+###########################################################################
+
+sub calculate_expire_delta {
+  my ($self, $newest_atime, $start, $max_expire_mult) = @_;
+
+  my %delta;  # use a hash since an array is going to be very sparse
+
+  # do the first pass, figure out atime delta
+  my ($tok, $packed);
+  while (($tok, $packed) = each %{$self->{db_toks}}) {
+    next if ($tok =~ MAGIC_RE); # skip magic tokens
+    
+    my ($ts, $th, $atime) = $self->tok_unpack ($packed);
+
+    # Go through from $start * 1 to $start * 512, mark how many tokens
+    # we would expire
+    my $token_age = $newest_atime - $atime;
+    for (my $i = 1; $i <= $max_expire_mult; $i<<=1) {
+      if ($token_age >= $start * $i) {
+        $delta{$i}++;
+      }
+      else {
+        # If the token age is less than the expire delta, it'll be
+        # less for all upcoming checks too, so abort early.
+        last;
+      }
+    }
+  }
+  return %delta;
+}
+
+###########################################################################
+
+sub token_expiration {
+  my ($self, $opts, $newdelta, @vars) = @_;
+
+  my $deleted = 0;
+  my $kept = 0;
+  my $num_hapaxes = 0;
+  my $num_lowfreq = 0;
+
+  # since DB_File will not shrink a database (!!), we need to *create*
+  # a new one instead.
+  my $main = $self->{osbf}->{main};
+  my $path = $main->sed_path($main->{conf}->{osbf_path});
+
+  # use a temporary PID-based suffix just in case another one was
+  # created previously by an interrupted expire
+  my $tmpsuffix = "expire$$";
+  my $tmpdbname = $path.'_toks.'.$tmpsuffix;
+
+  # clean out any leftover db copies from previous runs
+  for my $ext ($self->DB_EXTENSIONS) { unlink ($tmpdbname.$ext); }
+
+  # use O_EXCL to avoid races (bonus paranoia, since we should be locked
+  # anyway)
+  my %new_toks;
+  my $umask = umask 0;
+  tie %new_toks, $self->DBM_MODULE, $tmpdbname, O_RDWR|O_CREAT|O_EXCL,
+              (oct ($main->{conf}->{osbf_file_mode}) & 0666);
+  umask $umask;
+  my $oldest;
+
+  my $showdots = $opts->{showdots};
+  if ($showdots) { print STDERR "\n"; }
+
+  # We've chosen a new atime delta if we've gotten here, so record it
+  # for posterity.
+  $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = $newdelta;
+
+  # Figure out how old is too old...
+  my $too_old = $vars[10] - $newdelta; # tooold = newest - delta
+
+  # Go ahead and do the move to new db/expire run now ...
+  my ($tok, $packed);
+  while (($tok, $packed) = each %{$self->{db_toks}}) {
+    next if ($tok =~ MAGIC_RE); # skip magic tokens
+
+    my ($ts, $th, $atime) = $self->tok_unpack ($packed);
+
+    if ($atime < $too_old) {
+      $deleted++;
+    }
+    else {
+      # if token atime > newest, reset to newest ...
+      if ($atime > $vars[10]) {
+        $atime = $vars[10];
+      }
+
+      $new_toks{$tok} = $self->tok_pack ($ts, $th, $atime); $kept++;
+      if (!defined($oldest) || $atime < $oldest) { $oldest = $atime; }
+      if ($ts + $th == 1) {
+	$num_hapaxes++;
+      } elsif ($ts < 8 && $th < 8) {
+	$num_lowfreq++;
+      }
+    }
+
+    if ((($kept + $deleted) % 1000) == 0) {
+      if ($showdots) { print STDERR "."; }
+      $self->set_running_expire_tok();
+    }
+  }
+
+  # and add the magic tokens.  don't add the expire_running token.
+  $new_toks{$DB_VERSION_MAGIC_TOKEN} = $self->DB_VERSION;
+
+  # We haven't changed messages of each type seen, so just copy over.
+  $new_toks{$NSPAM_MAGIC_TOKEN} = $vars[1];
+  $new_toks{$NHAM_MAGIC_TOKEN} = $vars[2];
+
+  # We magically haven't removed the newest token, so just copy that value over.
+  $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $vars[10];
+
+  # The rest of these have been modified, so replace as necessary.
+  $new_toks{$NTOKENS_MAGIC_TOKEN} = $kept;
+  $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = time();
+  $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $oldest;
+  $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = $deleted;
+
+  # Sanity check: if we expired too many tokens, abort!
+  if ($kept < 100000) {
+    dbg("osbf: token expiration would expire too many tokens, aborting");
+    # set the magic tokens appropriately
+    # make sure the next expire run does a first pass
+    $self->{db_toks}->{$LAST_EXPIRE_MAGIC_TOKEN} = time();
+    $self->{db_toks}->{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = 0;
+    $self->{db_toks}->{$LAST_ATIME_DELTA_MAGIC_TOKEN} = 0;
+
+    # remove the new DB
+    untie %new_toks;
+    for my $ext ($self->DB_EXTENSIONS) { unlink ($tmpdbname.$ext); }
+
+    # reset the results for the return
+    $kept = $vars[3];
+    $deleted = 0;
+    $num_hapaxes = 0;
+    $num_lowfreq = 0;
+  }
+  else {
+    # now untie so we can do renames
+    untie %{$self->{db_toks}};
+    untie %new_toks;
+
+    # This is the critical phase (moving files around), so don't allow
+    # it to be interrupted.  Scope the signal changes.
+    {
+      local $SIG{'INT'} = 'IGNORE';
+      local $SIG{'TERM'} = 'IGNORE';
+      local $SIG{'HUP'} = 'IGNORE' if (!Mail::SpamAssassin::Util::am_running_on_windows());
+
+      # now rename in the new one.  Try several extensions
+      for my $ext ($self->DB_EXTENSIONS) {
+        my $newf = $tmpdbname.$ext;
+        my $oldf = $path.'_toks'.$ext;
+        next unless (-f $newf);
+        if (!rename ($newf, $oldf)) {
+	  warn "osbf: rename $newf to $oldf failed: $!\n";
+        }
+      }
+    }
+  }
+
+  # Call untie_db() so we unlock correctly.
+  $self->untie_db();
+
+  return ($kept, $deleted, $num_hapaxes, $num_lowfreq);
+}
+
+###########################################################################
+
+# Is a sync due?
+sub sync_due {
+  my ($self) = @_;
+
+  # don't bother doing old db versions
+  return 0 if ($self->{db_version} < $self->DB_VERSION);
+
+  my $conf = $self->{osbf}->{main}->{conf};
+  return 0 if ($conf->{bayes_journal_max_size} == 0);
+
+  my @vars = $self->get_storage_variables();
+  dbg("osbf: DB journal sync: last sync: ".$vars[7],'bayes','-1');
+
+  ## Ok, should we do a sync?
+
+  # Not if the journal file doesn't exist, it's not a file, or it's 0
+  # bytes long.
+  return 0 unless (stat($self->_get_journal_filename()) && -f _);
+
+  # Yes if the file size is larger than the specified maximum size.
+  return 1 if (-s _ > $conf->{bayes_journal_max_size});
+
+  # Yes there has been a sync before, and if it's been at least a day
+  # since that sync.
+  return 1 if (($vars[7] > 0) && (time - $vars[7] > 86400));
+
+  # No, I guess not.
+  return 0;
+}
+
+###########################################################################
+# db_seen reading APIs
+
+sub seen_get {
+  my ($self, $msgid) = @_;
+  $self->{db_seen}->{$msgid};
+}
+
+sub seen_put {
+  my ($self, $msgid, $seen) = @_;
+
+  if ($self->{osbf}->{main}->{learn_to_journal}) {
+    $self->defer_update ("m $seen $msgid");
+  }
+  else {
+    $self->_seen_put_direct($msgid, $seen);
+  }
+}
+sub _seen_put_direct {
+  my ($self, $msgid, $seen) = @_;
+  $self->{db_seen}->{$msgid} = $seen;
+}
+
+sub seen_delete {
+  my ($self, $msgid) = @_;
+
+  if ($self->{osbf}->{main}->{learn_to_journal}) {
+    $self->defer_update ("m f $msgid");
+  }
+  else {
+    $self->_seen_delete_direct($msgid);
+  }
+}
+sub _seen_delete_direct {
+  my ($self, $msgid) = @_;
+  delete $self->{db_seen}->{$msgid};
+}
+
+###########################################################################
+# db reading APIs
+
+sub tok_get {
+  my ($self, $tok) = @_;
+  $self->tok_unpack ($self->{db_toks}->{$tok});
+}
+ 
+sub tok_get_all {
+  my ($self, @tokens) = @_;
+
+  my @tokensdata;
+  foreach my $token (@tokens) {
+    my ($tok_spam, $tok_ham, $atime) = $self->tok_unpack($self->{db_toks}->{$token});
+    push(@tokensdata, [$token, $tok_spam, $tok_ham, $atime]);
+  }
+  return \@tokensdata;
+}
+
+# return the magic tokens in a specific order:
+# 0: scan count base
+# 1: number of spam
+# 2: number of ham
+# 3: number of tokens in db
+# 4: last expire atime
+# 5: oldest token in db atime
+# 6: db version value
+# 7: last journal sync
+# 8: last atime delta
+# 9: last expire reduction count
+# 10: newest token in db atime
+#
+sub get_storage_variables {
+  my ($self) = @_;
+  my @values;
+
+  my $db_ver = $self->{db_toks}->{$DB_VERSION_MAGIC_TOKEN};
+
+  if (!$db_ver || $db_ver =~ /\D/) { $db_ver = 0; }
+
+  if ($db_ver >= 2) {
+    my $DB2_LAST_ATIME_DELTA_MAGIC_TOKEN	= "\015\001\007\011\003LASTATIMEDELTA";
+    my $DB2_LAST_EXPIRE_MAGIC_TOKEN		= "\015\001\007\011\003LASTEXPIRE";
+    my $DB2_LAST_EXPIRE_REDUCE_MAGIC_TOKEN	= "\015\001\007\011\003LASTEXPIREREDUCE";
+    my $DB2_LAST_JOURNAL_SYNC_MAGIC_TOKEN	= "\015\001\007\011\003LASTJOURNALSYNC";
+    my $DB2_NEWEST_TOKEN_AGE_MAGIC_TOKEN	= "\015\001\007\011\003NEWESTAGE";
+    my $DB2_NHAM_MAGIC_TOKEN			= "\015\001\007\011\003NHAM";
+    my $DB2_NSPAM_MAGIC_TOKEN			= "\015\001\007\011\003NSPAM";
+    my $DB2_NTOKENS_MAGIC_TOKEN			= "\015\001\007\011\003NTOKENS";
+    my $DB2_OLDEST_TOKEN_AGE_MAGIC_TOKEN	= "\015\001\007\011\003OLDESTAGE";
+    my $DB2_RUNNING_EXPIRE_MAGIC_TOKEN		= "\015\001\007\011\003RUNNINGEXPIRE";
+
+    @values = (
+      0,
+      $self->{db_toks}->{$DB2_NSPAM_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB2_NHAM_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB2_NTOKENS_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB2_LAST_EXPIRE_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB2_OLDEST_TOKEN_AGE_MAGIC_TOKEN},
+      $db_ver,
+      $self->{db_toks}->{$DB2_LAST_JOURNAL_SYNC_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB2_LAST_ATIME_DELTA_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB2_LAST_EXPIRE_REDUCE_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB2_NEWEST_TOKEN_AGE_MAGIC_TOKEN},
+    );
+  }
+  elsif ($db_ver == 0) {
+    my $DB0_NSPAM_MAGIC_TOKEN = '**NSPAM';
+    my $DB0_NHAM_MAGIC_TOKEN = '**NHAM';
+    my $DB0_OLDEST_TOKEN_AGE_MAGIC_TOKEN = '**OLDESTAGE';
+    my $DB0_LAST_EXPIRE_MAGIC_TOKEN = '**LASTEXPIRE';
+    my $DB0_NTOKENS_MAGIC_TOKEN = '**NTOKENS';
+    my $DB0_SCANCOUNT_BASE_MAGIC_TOKEN = '**SCANBASE';
+
+    @values = (
+      $self->{db_toks}->{$DB0_SCANCOUNT_BASE_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB0_NSPAM_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB0_NHAM_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB0_NTOKENS_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB0_LAST_EXPIRE_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB0_OLDEST_TOKEN_AGE_MAGIC_TOKEN},
+      0,
+      0,
+      0,
+      0,
+      0,
+    );
+  }
+  elsif ($db_ver == 1) {
+    my $DB1_NSPAM_MAGIC_TOKEN			= "\015\001\007\011\003NSPAM";
+    my $DB1_NHAM_MAGIC_TOKEN			= "\015\001\007\011\003NHAM";
+    my $DB1_OLDEST_TOKEN_AGE_MAGIC_TOKEN	= "\015\001\007\011\003OLDESTAGE";
+    my $DB1_LAST_EXPIRE_MAGIC_TOKEN		= "\015\001\007\011\003LASTEXPIRE";
+    my $DB1_NTOKENS_MAGIC_TOKEN			= "\015\001\007\011\003NTOKENS";
+    my $DB1_SCANCOUNT_BASE_MAGIC_TOKEN		= "\015\001\007\011\003SCANBASE";
+
+    @values = (
+      $self->{db_toks}->{$DB1_SCANCOUNT_BASE_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB1_NSPAM_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB1_NHAM_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB1_NTOKENS_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB1_LAST_EXPIRE_MAGIC_TOKEN},
+      $self->{db_toks}->{$DB1_OLDEST_TOKEN_AGE_MAGIC_TOKEN},
+      1,
+      0,
+      0,
+      0,
+      0,
+    );
+  }
+
+  foreach (@values) {
+    if (!$_ || $_ =~ /\D/) {
+      $_ = 0;
+    }
+  }
+
+  return @values;
+}
+
+sub dump_db_toks {
+  my ($self, $template, $regex, @vars) = @_;
+
+  while (my ($tok, $tokvalue) = each %{$self->{db_toks}}) {
+    next if ($tok =~ MAGIC_RE); # skip magic tokens
+    next if (defined $regex && ($tok !~ /$regex/o));
+
+    # We have the value already, so just unpack it.
+    my ($ts, $th, $atime) = $self->tok_unpack ($tokvalue);
+    
+    my $prob = $self->{osbf}->_compute_prob_for_token($tok, $vars[1], $vars[2], $ts, $th);
+    $prob ||= 0.5;
+    
+    my $encoded_tok = unpack("H*",$tok);
+    printf $template,$prob,$ts,$th,$atime,$encoded_tok;
+  }
+}
+
+sub set_last_expire {
+  my ($self, $time) = @_;
+  $self->{db_toks}->{$LAST_EXPIRE_MAGIC_TOKEN} = time();
+}
+
+## Don't bother using get_magic_tokens here.  This token should only
+## ever exist when we're running expire, so we don't want to convert it if
+## it's there and we're not expiring ...
+sub get_running_expire_tok {
+  my ($self) = @_;
+  my $running = $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN};
+  if (!$running || $running =~ /\D/) { return undef; }
+  return $running;
+}
+
+sub set_running_expire_tok {
+  my ($self) = @_;
+
+  # update the lock and running expire magic token
+  $self->{osbf}->{main}->{locker}->refresh_lock ($self->{locked_file});
+  $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN} = time();
+}
+
+sub remove_running_expire_tok {
+  my ($self) = @_;
+  delete $self->{db_toks}->{$RUNNING_EXPIRE_MAGIC_TOKEN};
+}
+
+###########################################################################
+
+# db abstraction: allow deferred writes, since we will be frequently
+# writing while checking.
+
+sub tok_count_change {
+  my ($self, $ds, $dh, $tok, $atime) = @_;
+
+  $atime = 0 unless defined $atime;
+
+  if ($self->{bayes}->{main}->{learn_to_journal}) {
+    # we can't store the SHA1 binary value in the journal, so convert it
+    # to a printable value that can be converted back later
+    my $encoded_tok = unpack("H*",$tok);
+    $self->defer_update ("c $ds $dh $atime $encoded_tok");
+  } else {
+    $self->tok_sync_counters ($ds, $dh, $atime, $tok);
+  }
+}
+
+sub multi_tok_count_change {
+  my ($self, $ds, $dh, $tokens, $atime) = @_;
+
+  $atime = 0 unless defined $atime;
+
+  foreach my $tok (keys %{$tokens}) {
+    if ($self->{bayes}->{main}->{learn_to_journal}) {
+      # we can't store the SHA1 binary value in the journal, so convert it
+      # to a printable value that can be converted back later
+      my $encoded_tok = unpack("H*",$tok);
+      $self->defer_update ("c $ds $dh $atime $encoded_tok");
+    } else {
+      $self->tok_sync_counters ($ds, $dh, $atime, $tok);
+    }
+  }
+}
+
+sub nspam_nham_get {
+  my ($self) = @_;
+  my @vars = $self->get_storage_variables();
+  ($vars[1], $vars[2]);
+}
+
+sub nspam_nham_change {
+  my ($self, $ds, $dh) = @_;
+
+  if ($self->{osbf}->{main}->{learn_to_journal}) {
+    $self->defer_update ("n $ds $dh");
+  } else {
+    $self->tok_sync_nspam_nham ($ds, $dh);
+  }
+}
+
+sub tok_touch {
+  my ($self, $tok, $atime) = @_;
+  # we can't store the SHA1 binary value in the journal, so convert it
+  # to a printable value that can be converted back later
+  my $encoded_tok = unpack("H*", $tok);
+  $self->defer_update ("t $atime $encoded_tok");
+}
+
+sub tok_touch_all {
+  my ($self, $tokens, $atime) = @_;
+
+  foreach my $token (@{$tokens}) {
+    # we can't store the SHA1 binary value in the journal, so convert it
+    # to a printable value that can be converted back later
+    my $encoded_tok = unpack("H*", $token);
+    $self->defer_update ("t $atime $encoded_tok");
+  }
+}
+
+sub defer_update {
+  my ($self, $str) = @_;
+  $self->{string_to_journal} .= "$str\n";
+}
+
+###########################################################################
+
+sub cleanup {
+  my ($self) = @_;
+
+  my $nbytes = length ($self->{string_to_journal});
+  return if ($nbytes == 0);
+
+  my $path = $self->_get_journal_filename();
+
+  # use append mode, write atomically, then close, so simultaneous updates are
+  # not lost
+  my $conf = $self->{osbf}->{main}->{conf};
+
+  # set the umask to the inverse of what we want ...
+  my $umask = umask(0777 - (oct ($conf->{osbf_file_mode}) & 0666));
+
+  if (!open (OUT, ">>".$path)) {
+    warn "osbf: cannot write to $path, bayes db update ignored: $!\n";
+    umask $umask; # reset umask
+    return;
+  }
+  umask $umask; # reset umask
+
+  # do not use print() here, it will break up the buffer if it's >8192 bytes,
+  # which could result in two sets of tokens getting mixed up and their
+  # touches missed.
+  my $write_failure = 0;
+  my $original_point = tell OUT;
+  my $len;
+  do {
+    $len = syswrite (OUT, $self->{string_to_journal}, $nbytes);
+
+    # argh, write failure, give up
+    if (!defined $len || $len < 0) {
+      my $err = '';
+      if (!defined $len) {
+	$len = 0;
+	$err = "  ($!)";
+      }
+      warn "osbf: write failed to OSBF journal $path ($len of $nbytes)!$err\n";
+      last;
+    }
+
+    # This shouldn't happen, but could if the fs is full...
+    if ($len != $nbytes) {
+      warn "osbf: partial write to bayes journal $path ($len of $nbytes), recovering\n";
+
+      # we want to be atomic, so revert the journal file back to where
+      # we know it's "good".  if we can't truncate the journal, or we've
+      # tried 5 times to do the write, abort!
+      if (!truncate(OUT, $original_point) || ($write_failure++ > 4)) {
+        warn "osbf: cannot write to bayes journal $path, aborting!\n";
+	last;
+      }
+
+      # if the fs is full, let's give the system a break
+      sleep 1;
+    }
+  } while ($len != $nbytes);
+
+  if (!close OUT) {
+    warn "osbf: cannot write to $path, bayes db update ignored\n";
+  }
+
+  $self->{string_to_journal} = '';
+}
+
+# Return a qr'd RE to match a token with the correct format's magic token
+sub get_magic_re {
+  my ($self) = @_;
+
+  if (!defined $self->{db_version} || $self->{db_version} >= 1) {
+    return MAGIC_RE;
+  }
+
+  # When in doubt, assume v0
+  return qr/^\*\*[A-Z]+$/;
+}
+
+# provide a more generalized public interface into the journal sync
+
+sub sync {
+  my ($self, $opts) = @_;
+
+  return $self->_sync_journal($opts);
+}
+
+###########################################################################
+# And this method reads the journal and applies the changes in one
+# (locked) transaction.
+
+sub _sync_journal {
+  my ($self, $opts) = @_;
+  my $ret = 0;
+
+  my $path = $self->_get_journal_filename();
+
+  # if $path doesn't exist, or it's not a file, or is 0 bytes in length, return
+  if (!stat($path) || !-f _ || -z _) {
+    return 0;
+  }
+
+  my $eval_stat;
+  eval {
+    local $SIG{'__DIE__'};	# do not run user die() traps in here
+    if ($self->tie_db_writable()) {
+      $ret = $self->_sync_journal_trapped($opts, $path);
+    }
+    1;
+  } or do {
+    $eval_stat = $@ ne '' ? $@ : "errno=$!";  chomp $eval_stat;
+  };
+
+  # ok, untie from write-mode if we can
+  if (!$self->{osbf}->{main}->{learn_caller_will_untie}) {
+    $self->untie_db();
+  }
+
+  # handle any errors that may have occurred
+  if (defined $eval_stat) {
+    warn "osbf: $eval_stat\n";
+    return 0;
+  }
+
+  $ret;
+}
+
+sub _sync_journal_trapped {
+  my ($self, $opts, $path) = @_;
+
+  # Flag that we're doing work
+  $self->set_running_expire_tok();
+
+  my $started = time();
+  my $count = 0;
+  my $total_count = 0;
+  my %tokens;
+  my $showdots = $opts->{showdots};
+  my $retirepath = $path.".old";
+
+  # if $path doesn't exist, or it's not a file, or is 0 bytes in length,
+  # return we have to check again since the file may have been removed
+  # by a recent bayes db upgrade ...
+  if (!stat($path) || !-f _ || -z _) {
+    return 0;
+  }
+
+  if (!-r $path) { # will we be able to read the file?
+    warn "osbf: bad permissions on journal, can't read: $path\n";
+    return 0;
+  }
+
+  # This is the critical phase (moving files around), so don't allow
+  # it to be interrupted.
+  {
+    local $SIG{'INT'} = 'IGNORE';
+    local $SIG{'TERM'} = 'IGNORE';
+    local $SIG{'HUP'} = 'IGNORE' if (!Mail::SpamAssassin::Util::am_running_on_windows());
+
+    # retire the journal, so we can update the db files from it in peace.
+    # TODO: use locking here
+    if (!rename ($path, $retirepath)) {
+      warn "osbf: failed rename $path to $retirepath\n";
+      return 0;
+    }
+
+    # now read the retired journal
+    if (!open (JOURNAL, "<$retirepath")) {
+      warn "osbf: cannot open read $retirepath\n";
+      return 0;
+    }
+
+
+    # Read the journal
+    while (<JOURNAL>) {
+      $total_count++;
+
+      if (/^t (\d+) (.+)$/) { # Token timestamp update, cache resultant entries
+	my $tok = pack("H*",$2);
+	$tokens{$tok} = $1+0 if (!exists $tokens{$tok} || $1+0 > $tokens{$tok});
+      } elsif (/^c (-?\d+) (-?\d+) (\d+) (.+)$/) { # Add/full token update
+        my $tok = pack("H*",$4);
+        $self->tok_sync_counters ($1+0, $2+0, $3+0, $tok);
+        $count++;
+      } elsif (/^n (-?\d+) (-?\d+)$/) { # update ham/spam count
+	$self->tok_sync_nspam_nham ($1+0, $2+0);
+	$count++;
+      } elsif (/^m ([hsf]) (.+)$/) { # update msgid seen database
+	if ($1 eq "f") {
+	  $self->_seen_delete_direct($2);
+	}
+	else {
+	  $self->_seen_put_direct($2,$1);
+	}
+	$count++;
+      } else {
+	warn "osbf: gibberish entry found in journal: $_";
+      }
+    }
+    close JOURNAL;
+
+    # Now that we've determined what tokens we need to update and their
+    # final values, update the DB.  Should be much smaller than the full
+    # journal entries.
+    while (my ($k,$v) = each %tokens) {
+      $self->tok_touch_token ($v, $k);
+
+      if ((++$count % 1000) == 0) {
+	if ($showdots) { print STDERR "."; }
+	$self->set_running_expire_tok();
+      }
+    }
+
+    if ($showdots) { print STDERR "\n"; }
+
+    # we're all done, so unlink the old journal file
+    unlink ($retirepath) || warn "osbf: can't unlink $retirepath: $!\n";
+
+    $self->{db_toks}->{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = $started;
+
+    my $done = time();
+    my $msg = ("osbf: synced databases from journal in " .
+	       ($done - $started) .
+	       " seconds: $count unique entries ($total_count total entries)");
+
+    if ($opts->{verbose}) {
+      print $msg,"\n";
+    } else {
+      dbg($msg);
+    }
+  }
+
+  # else, that's the lot, we're synced.  return
+  return 1;
+}
+
+sub tok_touch_token {
+  my ($self, $atime, $tok) = @_;
+  my ($ts, $th, $oldatime) = $self->tok_get ($tok);
+
+  # If the new atime is < the old atime, ignore the update
+  # We figure that we'll never want to lower a token atime, so abort if
+  # we try.  (journal out of sync, etc.)
+  return if ($oldatime >= $atime);
+
+  $self->tok_put ($tok, $ts, $th, $atime);
+}
+
+sub tok_sync_counters {
+  my ($self, $ds, $dh, $atime, $tok) = @_;
+  my ($ts, $th, $oldatime) = $self->tok_get ($tok);
+  $ts += $ds; if ($ts < 0) { $ts = 0; }
+  $th += $dh; if ($th < 0) { $th = 0; }
+
+  # Don't roll the atime of tokens backwards ...
+  $atime = $oldatime if ($oldatime > $atime);
+
+  $self->tok_put ($tok, $ts, $th, $atime);
+}
+
+sub tok_put {
+  my ($self, $tok, $ts, $th, $atime) = @_;
+  $ts ||= 0;
+  $th ||= 0;
+
+  # Ignore magic tokens, the don't go in this way ...
+  return if ($tok =~ MAGIC_RE);
+
+  # use defined() rather than exists(); the latter is not supported
+  # by NDBM_File, believe it or not.  Using defined() did not
+  # indicate any noticeable speed hit in my testing. (Mar 31 2003 jm)
+  my $exists_already = defined $self->{db_toks}->{$tok};
+
+  if ($ts == 0 && $th == 0) {
+    return if (!$exists_already); # If the token doesn't exist, just return
+    $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN}--;
+    delete $self->{db_toks}->{$tok};
+  } else {
+    if (!$exists_already) { # If the token doesn't exist, raise the token count
+      $self->{db_toks}->{$NTOKENS_MAGIC_TOKEN}++;
+    }
+
+    $self->{db_toks}->{$tok} = $self->tok_pack ($ts, $th, $atime);
+
+    my $newmagic = $self->{db_toks}->{$NEWEST_TOKEN_AGE_MAGIC_TOKEN};
+    if (!defined ($newmagic) || $atime > $newmagic) {
+      $self->{db_toks}->{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $atime;
+    }
+
+    # Make sure to check for either !defined or "" ...  Apparently
+    # sometimes the DB module doesn't return the value correctly. :(
+    my $oldmagic = $self->{db_toks}->{$OLDEST_TOKEN_AGE_MAGIC_TOKEN};
+    if (!defined ($oldmagic) || $oldmagic eq "" || $atime < $oldmagic) {
+      $self->{db_toks}->{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $atime;
+    }
+  }
+}
+
+sub tok_sync_nspam_nham {
+  my ($self, $ds, $dh) = @_;
+  my ($ns, $nh) = ($self->get_storage_variables())[1,2];
+  if ($ds) { $ns += $ds; } if ($ns < 0) { $ns = 0; }
+  if ($dh) { $nh += $dh; } if ($nh < 0) { $nh = 0; }
+  $self->{db_toks}->{$NSPAM_MAGIC_TOKEN} = $ns;
+  $self->{db_toks}->{$NHAM_MAGIC_TOKEN} = $nh;
+}
+
+###########################################################################
+
+sub _get_journal_filename {
+  my ($self) = @_;
+
+  my $main = $self->{osbf}->{main};
+  return $main->sed_path($main->{conf}->{osbf_path}."_journal");
+}
+
+###########################################################################
+
+# this is called directly from sa-learn(1).
+sub perform_upgrade {
+  my ($self, $opts) = @_;
+  my $ret = 0;
+
+  my $eval_stat;
+  eval {
+    local $SIG{'__DIE__'};	# do not run user die() traps in here
+
+    use File::Basename;
+    use File::Copy;
+
+    # bayes directory
+    my $main = $self->{osbf}->{main};
+    my $path = $main->sed_path($main->{conf}->{osbf_path});
+    my $dir = dirname($path);
+
+    # make temporary copy since old dbm and new dbm may have same name
+    opendir(DIR, $dir) || die "osbf: can't opendir $dir: $!";
+    my @files = grep { /^osbf_(?:seen|toks)(?:\.\w+)?$/ } readdir(DIR);
+    closedir(DIR);
+    if (@files < 2 || !grep(/osbf_seen/,@files) || !grep(/osbf_toks/,@files))
+    {
+      die "osbf: unable to find osbf_toks and osbf_seen, stopping\n";
+    }
+    # untaint @files (already safe after grep)
+    untaint_var(\@files);
+ 	 
+    for (@files) {
+      my $src = "$dir/$_";
+      my $dst = "$dir/old_$_";
+      copy($src, $dst) || die "osbf: can't copy $src to $dst: $!\n";
+    }
+
+    # delete previous to make way for import
+    for (@files) { unlink("$dir/$_"); }
+
+    # import
+    if ($self->tie_db_writable()) {
+      $ret += $self->upgrade_old_dbm_files_trapped("$dir/old_osbf_seen",
+						   $self->{db_seen});
+      $ret += $self->upgrade_old_dbm_files_trapped("$dir/old_osbf_toks",
+						   $self->{db_toks});
+    }
+
+    if ($ret == 2) {
+      print "import successful, original files saved with \"old\" prefix\n";
+    }
+    else {
+      print "import failed, original files saved with \"old\" prefix\n";
+    }
+    1;
+  } or do {
+    $eval_stat = $@ ne '' ? $@ : "errno=$!";  chomp $eval_stat;
+  };
+
+  $self->untie_db();
+
+  # if we died, untie the dbm files
+  if (defined $eval_stat) {
+    warn "osbf: perform_upgrade: $eval_stat\n";
+    return 0;
+  }
+  $ret;
+}
+
+sub upgrade_old_dbm_files_trapped {
+  my ($self, $filename, $output) = @_;
+
+  my $count;
+  my %in;
+
+  print "upgrading to DB_File, please be patient: $filename\n";
+
+  # try each type of file until we find one with > 0 entries
+  for my $dbm ('DB_File', 'GDBM_File', 'NDBM_File', 'SDBM_File') {
+    $count = 0;
+    # wrap in eval so it doesn't run in general use.  This accesses db
+    # modules directly.
+    # Note: (bug 2390), the 'use' needs to be on the same line as the eval
+    # for RPM dependency checks to work properly.  It's lame, but...
+    my $eval_stat;
+    eval 'use ' . $dbm . ';
+      tie %in, "' . $dbm . '", $filename, O_RDONLY, 0600;
+      %{ $output } = %in;
+      $count = scalar keys %{ $output };
+      untie %in;
+      1;
+    ' or do {
+      $eval_stat = $@ ne '' ? $@ : "errno=$!";  chomp $eval_stat;
+    };
+    if (defined $eval_stat) {
+      print "$dbm: $dbm module not installed(?), nothing copied: $eval_stat\n";
+      dbg("osbf: error was: $eval_stat");
+    }
+    elsif ($count == 0) {
+      print "$dbm: no database of that kind found, nothing copied\n";
+    }
+    else {
+      print "$dbm: copied $count entries\n";
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
+sub clear_database {
+  my ($self) = @_;
+
+  return 0 unless ($self->tie_db_writable());
+
+  dbg("osbf: untie-ing in preparation for removal.");
+
+  foreach my $dbname (@DBNAMES) {
+    my $db_var = 'db_'.$dbname;
+
+    if (exists $self->{$db_var}) {
+      # dbg("osbf: untie-ing $db_var");
+      untie %{$self->{$db_var}};
+      delete $self->{$db_var};
+    }
+  }
+
+  my $path = $self->{osbf}->{main}->sed_path($self->{osbf}->{main}->{conf}->{osbf_path});
+
+  foreach my $dbname (@DBNAMES, 'journal') {
+    foreach my $ext ($self->DB_EXTENSIONS) {
+      my $name = $path.'_'.$dbname.$ext;
+      my $ret = unlink $name;
+      dbg("osbf: clear_database: " . ($ret ? 'removed' : 'tried to remove') . " $name");
+    }
+  }
+
+  # the journal file needs to be done separately since it has no extension
+  foreach my $dbname ('journal') {
+    my $name = $path.'_'.$dbname;
+    my $ret = unlink $name;
+    dbg("osbf: clear_database: " . ($ret ? 'removed' : 'tried to remove') . " $name");
+  }
+
+  $self->untie_db();
+
+  return 1;
+}
+
+sub backup_database {
+  my ($self) = @_;
+
+  # we tie writable because we want the upgrade code to kick in if needed
+  return 0 unless ($self->tie_db_writable());
+
+  my @vars = $self->get_storage_variables();
+
+  print "v\t$vars[6]\tdb_version # this must be the first line!!!\n";
+  print "v\t$vars[1]\tnum_spam\n";
+  print "v\t$vars[2]\tnum_nonspam\n";
+
+  while (my ($tok, $packed) = each %{$self->{db_toks}}) {
+    next if ($tok =~ MAGIC_RE); # skip magic tokens
+
+    my ($ts, $th, $atime) = $self->tok_unpack($packed);
+    my $encoded_token = unpack("H*",$tok);
+    print "t\t$ts\t$th\t$atime\t$encoded_token\n";
+  }
+
+  while (my ($msgid, $flag) = each %{$self->{db_seen}}) {
+    print "s\t$flag\t$msgid\n";
+  }
+
+  $self->untie_db();
+
+  return 1;
+}
+
+sub restore_database {
+  my ($self, $filename, $showdots) = @_;
+
+  if (!open(DUMPFILE, '<', $filename)) {
+    dbg("osbf: unable to open backup file $filename: $!");
+    return 0;
+  }
+   
+  if (!$self->tie_db_writable()) {
+    dbg("osbf: failed to tie db writable");
+    return 0;
+  }
+
+  my $main = $self->{osbf}->{main};
+  my $path = $main->sed_path($main->{conf}->{osbf_path});
+
+  # use a temporary PID-based suffix just in case another one was
+  # created previously by an interrupted expire
+  my $tmpsuffix = "convert$$";
+  my $tmptoksdbname = $path.'_toks.'.$tmpsuffix;
+  my $tmpseendbname = $path.'_seen.'.$tmpsuffix;
+  my $toksdbname = $path.'_toks';
+  my $seendbname = $path.'_seen';
+
+  my %new_toks;
+  my %new_seen;
+  my $umask = umask 0;
+  unless (tie %new_toks, $self->DBM_MODULE, $tmptoksdbname, O_RDWR|O_CREAT|O_EXCL,
+	  (oct ($main->{conf}->{osbf_file_mode}) & 0666)) {
+    dbg("osbf: failed to tie temp toks db: $!");
+    $self->untie_db();
+    umask $umask;
+    return 0;
+  }
+  unless (tie %new_seen, $self->DBM_MODULE, $tmpseendbname, O_RDWR|O_CREAT|O_EXCL,
+	  (oct ($main->{conf}->{osbf_file_mode}) & 0666)) {
+    dbg("osbf: failed to tie temp seen db: $!");
+    untie %new_toks;
+    $self->_unlink_file($tmptoksdbname);
+    $self->untie_db();
+    umask $umask;
+    return 0;
+  }
+  umask $umask;
+
+  my $line_count = 0;
+  my $db_version;
+  my $token_count = 0;
+  my $num_spam;
+  my $num_ham;
+  my $error_p = 0;
+  my $newest_token_age = 0;
+  # Kinda wierd I know, but we need a nice big value and we know there will be
+  # no tokens > time() since we reset atime if > time(), so use that with a
+  # little buffer just in case.
+  my $oldest_token_age = time() + 100000;
+
+  my $line = <DUMPFILE>;
+  $line_count++;
+
+  # We require the database version line to be the first in the file so we can
+  # figure out how to properly deal with the file.  If it is not the first
+  # line then fail
+  if ($line =~ m/^v\s+(\d+)\s+db_version/) {
+    $db_version = $1;
+  }
+  else {
+    dbg("osbf: database version must be the first line in the backup file, correct and re-run");
+    untie %new_toks;
+    untie %new_seen;
+    $self->_unlink_file($tmptoksdbname);
+    $self->_unlink_file($tmpseendbname);
+    $self->untie_db();
+    return 0;
+  }
+
+  unless ($db_version == 2 || $db_version == 3) {
+    warn("osbf: database version $db_version is unsupported, must be version 2 or 3");
+    untie %new_toks;
+    untie %new_seen;
+    $self->_unlink_file($tmptoksdbname);
+    $self->_unlink_file($tmpseendbname);
+    $self->untie_db();
+    return 0;
+  }
+
+  while (my $line = <DUMPFILE>) {
+    chomp($line);
+    $line_count++;
+
+    if ($line_count % 1000 == 0) {
+      print STDERR "." if ($showdots);
+    }
+
+    if ($line =~ /^v\s+/) { # variable line
+      my @parsed_line = split(/\s+/, $line, 3);
+      my $value = $parsed_line[1] + 0;
+      if ($parsed_line[2] eq 'num_spam') {
+	$num_spam = $value;
+      }
+      elsif ($parsed_line[2] eq 'num_nonspam') {
+	$num_ham = $value;
+      }
+      else {
+	dbg("osbf: restore_database: skipping unknown line: $line");
+      }
+    }
+    elsif ($line =~ /^t\s+/) { # token line
+      my @parsed_line = split(/\s+/, $line, 5);
+      my $spam_count = $parsed_line[1] + 0;
+      my $ham_count = $parsed_line[2] + 0;
+      my $atime = $parsed_line[3] + 0;
+      my $token = $parsed_line[4];
+
+      my $token_warn_p = 0;
+      my @warnings;
+
+      if ($spam_count < 0) {
+	$spam_count = 0;
+	push(@warnings, 'spam count < 0, resetting');
+	$token_warn_p = 1;
+      }
+      if ($ham_count < 0) {
+	$ham_count = 0;
+	push(@warnings, 'ham count < 0, resetting');
+	$token_warn_p = 1;
+      }
+
+      if ($spam_count == 0 && $ham_count == 0) {
+	dbg("osbf: token has zero spam and ham count, skipping");
+	next;
+      }
+
+      if ($atime > time()) {
+	$atime = time();
+	push(@warnings, 'atime > current time, resetting');
+	$token_warn_p = 1;
+      }
+
+      if ($token_warn_p) {
+	dbg("osbf: token ($token) has the following warnings:\n".join("\n",@warnings));
+      }
+
+      # database versions < 3 did not encode their token values
+      if ($db_version < 3) {
+	$token = substr(sha1($token), -5);
+      }
+      else {
+	# turn unpacked binary token back into binary value
+	$token = pack("H*",$token);
+      }
+
+      $new_toks{$token} = $self->tok_pack($spam_count, $ham_count, $atime);
+      if ($atime < $oldest_token_age) {
+	$oldest_token_age = $atime;
+      }
+      if ($atime > $newest_token_age) {
+	$newest_token_age = $atime;
+      }
+      $token_count++;
+    }
+    elsif ($line =~ /^s\s+/) { # seen line
+      my @parsed_line = split(/\s+/, $line, 3);
+      my $flag = $parsed_line[1];
+      my $msgid = $parsed_line[2];
+
+      unless ($flag eq 'h' || $flag eq 's') {
+	dbg("osbf: unknown seen flag ($flag) for line: $line, skipping");
+	next;
+      }
+
+      unless ($msgid) {
+	dbg("osbf: blank msgid for line: $line, skipping");
+	next;
+      }
+
+      $new_seen{$msgid} = $flag;
+    }
+    else {
+      dbg("osbf: skipping unknown line: $line");
+      next;
+    }
+  }
+  close(DUMPFILE);
+
+  print STDERR "\n" if ($showdots);
+
+  unless (defined($num_spam)) {
+    dbg("osbf: unable to find num spam, please check file");
+    $error_p = 1;
+  }
+
+  unless (defined($num_ham)) {
+    dbg("osbf: unable to find num ham, please check file");
+    $error_p = 1;
+  }
+
+  if ($error_p) {
+    dbg("osbf: error(s) while attempting to load $filename, correct and re-run");
+
+    untie %new_toks;
+    untie %new_seen;
+    $self->_unlink_file($tmptoksdbname);
+    $self->_unlink_file($tmpseendbname);
+    $self->untie_db();
+    return 0;
+  }
+
+  # set the calculated magic tokens
+  $new_toks{$DB_VERSION_MAGIC_TOKEN} = $self->DB_VERSION();
+  $new_toks{$NTOKENS_MAGIC_TOKEN} = $token_count;
+  $new_toks{$NSPAM_MAGIC_TOKEN} = $num_spam;
+  $new_toks{$NHAM_MAGIC_TOKEN} = $num_ham;
+  $new_toks{$NEWEST_TOKEN_AGE_MAGIC_TOKEN} = $newest_token_age;
+  $new_toks{$OLDEST_TOKEN_AGE_MAGIC_TOKEN} = $oldest_token_age;
+
+  # go ahead and zero out these, chances are good that they are bogus anyway.
+  $new_toks{$LAST_EXPIRE_MAGIC_TOKEN} = 0;
+  $new_toks{$LAST_JOURNAL_SYNC_MAGIC_TOKEN} = 0;
+  $new_toks{$LAST_ATIME_DELTA_MAGIC_TOKEN} = 0;
+  $new_toks{$LAST_EXPIRE_REDUCE_MAGIC_TOKEN} = 0;
+
+  local $SIG{'INT'} = 'IGNORE';
+  local $SIG{'TERM'} = 'IGNORE';
+  local $SIG{'HUP'} = 'IGNORE' if (!Mail::SpamAssassin::Util::am_running_on_windows());
+
+  untie %new_toks;
+  untie %new_seen;
+  $self->untie_db();
+
+  # Here is where something can go horribly wrong and screw up the bayes
+  # database files.  If we are able to copy one and not the other then it
+  # will leave the database in an inconsistent state.  Since this is an
+  # edge case, and they're trying to replace the DB anyway we should be ok.
+  unless ($self->_rename_file($tmptoksdbname, $toksdbname)) {
+    dbg("osbf: error while renaming $tmptoksdbname to $toksdbname: $!");
+    return 0;
+  }
+  unless ($self->_rename_file($tmpseendbname, $seendbname)) {
+    dbg("osbf: error while renaming $tmpseendbname to $seendbname: $!");
+    dbg("osbf: database now in inconsistent state");
+    return 0;
+  }
+
+  dbg("osbf: parsed $line_count lines");
+  dbg("osbf: created database with $token_count tokens based on $num_spam spam messages and $num_ham ham messages");
+
+  return 1;
+}
+
+###########################################################################
+
+# token marshalling format for db_toks.
+
+# Since we may have many entries with few hits, especially thousands of hapaxes
+# (1-occurrence entries), use a flexible entry format, instead of simply "2
+# packed ints", to keep the memory and disk space usage down.  In my
+# 18k-message test corpus, only 8.9% have >= 8 hits in either counter, so we
+# can use a 1-byte representation for the other 91% of low-hitting entries
+# and save masses of space.
+
+# This looks like: XXSSSHHH (XX = format bits, SSS = 3 spam-count bits, HHH = 3
+# ham-count bits).  If XX in the first byte is 11, it's packed as this 1-byte
+# representation; otherwise, if XX in the first byte is 00, it's packed as
+# "CLL", ie. 1 byte and 2 32-bit "longs" in perl pack format.
+
+# Savings: roughly halves size of toks db, at the cost of a ~10% slowdown.
+
+use constant FORMAT_FLAG	=> 0xc0;	# 11000000
+use constant ONE_BYTE_FORMAT	=> 0xc0;	# 11000000
+use constant TWO_LONGS_FORMAT	=> 0x00;	# 00000000
+
+use constant ONE_BYTE_SSS_BITS	=> 0x38;	# 00111000
+use constant ONE_BYTE_HHH_BITS	=> 0x07;	# 00000111
+
+sub tok_unpack {
+  my ($self, $value) = @_;
+  $value ||= 0;
+
+  my ($packed, $atime) = unpack("CV", $value);
+
+  if (($packed & FORMAT_FLAG) == ONE_BYTE_FORMAT) {
+    return (($packed & ONE_BYTE_SSS_BITS) >> 3,
+		$packed & ONE_BYTE_HHH_BITS,
+		$atime || 0);
+  }
+  elsif (($packed & FORMAT_FLAG) == TWO_LONGS_FORMAT) {
+    my ($packed, $ts, $th, $atime) = unpack("CVVV", $value);
+    return ($ts || 0, $th || 0, $atime || 0);
+  }
+  # other formats would go here...
+  else {
+    warn "osbf: unknown packing format for bayes db, please re-learn: $packed";
+    return (0, 0, 0);
+  }
+}
+
+sub tok_pack {
+  my ($self, $ts, $th, $atime) = @_;
+  $ts ||= 0; $th ||= 0; $atime ||= 0;
+  if ($ts < 8 && $th < 8) {
+    return pack ("CV", ONE_BYTE_FORMAT | ($ts << 3) | $th, $atime);
+  } else {
+    return pack ("CVVV", TWO_LONGS_FORMAT, $ts, $th, $atime);
+  }
+}
+
+###########################################################################
+
+sub db_readable {
+  my ($self) = @_;
+  return $self->{already_tied};
+}
+
+sub db_writable {
+  my ($self) = @_;
+  return $self->{already_tied} && $self->{is_locked};
+}
+
+###########################################################################
+
+sub _unlink_file {
+  my ($self, $filename) = @_;
+
+  unlink $filename;
+}
+
+sub _rename_file {
+  my ($self, $sourcefilename, $targetfilename) = @_;
+
+  return 0 unless (rename($sourcefilename, $targetfilename));
+
+  return 1;
+}
+
+sub sa_die { Mail::SpamAssassin::sa_die(@_); }
+
+1;
Index: lib/Mail/SpamAssassin/Plugin/OSBF.pm
===================================================================
--- lib/Mail/SpamAssassin/Plugin/OSBF.pm	(.../trunk)	(revision 0)
+++ lib/Mail/SpamAssassin/Plugin/OSBF.pm	(.../branches/bug-5293-pluginized-bayes)	(revision 602913)
@@ -0,0 +1,1372 @@
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at:
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+
+=head1 NAME
+
+Mail::SpamAssassin::Plugin::OSBF - OSBF learning classifier
+
+=head1 DESCRIPTION
+
+This plugin implements a trained probabilistic classifier, using an algorithm
+based on Winnow, as described in section 2 of _Combining Winnow and Orthogonal
+Sparse Bigrams for Incremental Spam Filtering_, by Siefkes, Assis, Chhabra and
+Yerazunis:
+
+  http://www.siefkes.net/ie/winnow-spam.pdf
+  http://en.wikipedia.org/wiki/Winnow
+
+The tokenizer uses Orthogonal Sparse Bigrams, as described in that paper.
+
+The results are incorporated into SpamAssassin as the OSBF_* rules.
+
+=head1 METHODS
+
+=over 4
+
+=cut
+
+package Mail::SpamAssassin::Plugin::OSBF;
+
+use strict;
+use warnings;
+use bytes;
+use re 'taint';
+
+use Mail::SpamAssassin;
+use Mail::SpamAssassin::Plugin;
+use Mail::SpamAssassin::PerMsgStatus;
+use Mail::SpamAssassin::Logger;
+use Mail::SpamAssassin::Util qw(untaint_var);
+
+use Mail::SpamAssassin::Bayes::CombineNaiveBayes;
+
+use Digest::SHA1 qw(sha1 sha1_hex);
+
+our @ISA = qw(Mail::SpamAssassin::Plugin);
+
+use vars qw{
+  $IGNORED_HDRS
+  $MARK_PRESENCE_ONLY_HDRS
+  $OPPORTUNISTIC_LOCK_VALID
+};
+
+# Which headers should we scan for tokens?  Don't use all of them, as it's easy
+# to pick up spurious clues from some.  What we now do is use all of them
+# *less* these well-known headers; that way we can pick up spammers' tracking
+# headers (which are obviously not well-known in advance!).
+
+# Received is handled specially
+$IGNORED_HDRS = qr{(?: (?:X-)?Sender    # misc noise
+  |Delivered-To |Delivery-Date
+  |(?:X-)?Envelope-To
+  |X-MIME-Auto[Cc]onverted |X-Converted-To-Plain-Text
+
+  |Subject      # not worth a tiny gain vs. to db size increase
+
+  # Date: can provide invalid cues if your spam corpus is
+  # older/newer than ham
+  |Date
+
+  # List headers: ignore. a spamfiltering mailing list will
+  # become a nonspam sign.
+  |X-List|(?:X-)?Mailing-List
+  |(?:X-)?List-(?:Archive|Help|Id|Owner|Post|Subscribe
+    |Unsubscribe|Host|Id|Manager|Admin|Comment
+    |Name|Url)
+  |X-Unsub(?:scribe)?
+  |X-Mailman-Version |X-Been[Tt]here |X-Loop
+  |Mail-Followup-To
+  |X-eGroups-(?:Return|From)
+  |X-MDMailing-List
+  |X-XEmacs-List
+
+  # gatewayed through mailing list (thanks to Allen Smith)
+  |(?:X-)?Resent-(?:From|To|Date)
+  |(?:X-)?Original-(?:From|To|Date)
+
+  # Spamfilter/virus-scanner headers: too easy to chain from
+  # these
+  |X-MailScanner(?:-SpamCheck)?
+  |X-Spam(?:-(?:Status|Level|Flag|Report|Hits|Score|Checker-Version))?
+  |X-Antispam |X-RBL-Warning |X-Mailscanner
+  |X-MDaemon-Deliver-To |X-Virus-Scanned
+  |X-Mass-Check-Id
+  |X-Pyzor |X-DCC-\S{2,25}-Metrics
+  |X-Filtered-B[Yy] |X-Scanned-By |X-Scanner
+  |X-AP-Spam-(?:Score|Status) |X-RIPE-Spam-Status
+  |X-SpamCop-[^:]+
+  |X-SMTPD |(?:X-)?Spam-Apparently-To
+  |SPAM |X-Perlmx-Spam
+  |X-Bogosity
+
+  # some noisy Outlook headers that add no good clues:
+  |Content-Class |Thread-(?:Index|Topic)
+  |X-Original[Aa]rrival[Tt]ime
+
+  # Annotations from IMAP, POP, and MH:
+  |(?:X-)?Status |X-Flags |Replied |Forwarded
+  |Lines |Content-Length
+  |X-UIDL? |X-IMAPbase
+
+  # Annotations from Bugzilla
+  |X-Bugzilla-[^:]+
+
+  # Annotations from VM: (thanks to Allen Smith)
+  |X-VM-(?:Bookmark|(?:POP|IMAP)-Retrieved|Labels|Last-Modified
+    |Summary-Format|VHeader|v\d-Data|Message-Order)
+
+  # Annotations from Gnus:
+  | X-Gnus-Mail-Source
+  | Xref
+
+)}x;
+
+# Note only the presence of these headers, in order to reduce the
+# hapaxen they generate.
+$MARK_PRESENCE_ONLY_HDRS = qr{(?: X-Face
+  |X-(?:Gnu-?PG|PGP|GPG)(?:-Key)?-Fingerprint
+)}ix;
+
+# tweaks tested as of Nov 18 2002 by jm: see SpamAssassin-devel list archives
+# for results.  The winners are now the default settings.
+use constant IGNORE_TITLE_CASE => 1;
+use constant TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES => 1;
+use constant TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
+
+# tweaks of May 12 2003, see SpamAssassin-devel archives again.
+use constant PRE_CHEW_ADDR_HEADERS => 1;
+use constant CHEW_BODY_URIS => 1;
+use constant CHEW_BODY_MAILADDRS => 1;
+use constant HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
+use constant BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
+use constant URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 0;
+use constant IGNORE_MSGID_TOKENS => 0;
+
+# tweaks of 12 March 2004, see bug 2129.
+use constant DECOMPOSE_BODY_TOKENS => 1;
+use constant MAP_HEADERS_MID => 1;
+use constant MAP_HEADERS_FROMTOCC => 1;
+use constant MAP_HEADERS_USERAGENT => 1;
+
+# tweaks, see http://issues.apache.org/SpamAssassin/show_bug.cgi?id=3173#c26
+use constant ADD_INVIZ_TOKENS_I_PREFIX => 1;
+use constant ADD_INVIZ_TOKENS_NO_PREFIX => 0;
+
+# How many seconds should the opportunistic_expire lock be valid?
+$OPPORTUNISTIC_LOCK_VALID = 300;
+
+# How many of the most significant tokens should we use for the p(w)
+# calculation?
+use constant N_SIGNIFICANT_TOKENS => 999;
+
+# How many significant tokens are required for a classifier score to
+# be considered usable?
+use constant REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE => -1;
+
+###########################################################################
+
+sub new {
+  my $class = shift;
+  my ($main) = @_;
+
+  $class = ref($class) || $class;
+  my $self = $class->SUPER::new($main);
+  bless ($self, $class);
+
+  $self->{main} = $main;
+  $self->{conf} = $main->{conf};
+  $self->{use_ignores} = 1;
+
+  $self->register_eval_rule("check_osbf");
+  $self;
+}
+
+sub finish {
+  my $self = shift;
+  if ($self->{store}) {
+    $self->{store}->untie_db();
+  }
+  %{$self} = ();
+}
+
+# Plugin hook.
+# Return this implementation object, for callers that need to know
+# it.  TODO: callers shouldn't *need* to know it! 
+#
+# used in test suite to get access to {store}, internal APIs;
+# used in Mail::SpamAssassin::PerMsgStatus for the
+# compute_declassification_distance() call.
+#
+sub learner_get_implementation {      
+  my ($self) = @_;
+  return $self;
+}
+
+###########################################################################
+
+sub check_osbf {
+  my ($self, $pms, $fulltext, $min, $max) = @_;
+
+  return 0 if (!$pms->{conf}->{use_learner});
+  return 0 if (!$pms->{conf}->{use_bayes} || !$pms->{conf}->{use_bayes_rules});
+
+  # TODO: osbf_score?
+
+  if (!exists ($pms->{bayes_score})) {
+    my $timer = $self->{main}->time_method("check_osbf");
+    $pms->{bayes_score} = $self->scan($pms, $pms->{msg});
+  }
+
+  if (defined $pms->{bayes_score} &&
+      ($min == 0 || $pms->{bayes_score} > $min) &&
+      ($max eq "undef" || $pms->{bayes_score} <= $max))
+  {
+      if ($pms->{conf}->{detailed_bayes_score}) {
+        $pms->test_log(sprintf ("score: %3.4f, hits: %s",
+                                 $pms->{bayes_score},
+                                 $pms->{bayes_hits}));
+      }
+      else {
+        $pms->test_log(sprintf ("score: %3.4f", $pms->{bayes_score}));
+      }
+      return 1;
+  }
+
+  return 0;
+}
+
+###########################################################################
+
+# Plugin hook.
+sub learner_close {
+  my ($self, $params) = @_;
+  if ($self->{store}->db_readable()) { $self->{store}->untie_db(); }
+}
+
+###########################################################################
+
+sub ignore_message {
+  my ($self,$PMS) = @_;
+
+  return 0 unless $self->{use_ignores};
+
+  my $ig_from = $self->{main}->call_plugins ("check_wb_list",
+        { permsgstatus => $PMS, type => 'from', list => 'bayes_ignore_from' });
+  my $ig_to = $self->{main}->call_plugins ("check_wb_list",
+        { permsgstatus => $PMS, type => 'to', list => 'bayes_ignore_to' });
+
+  my $ignore = $ig_from || $ig_to;
+
+  dbg("osbf: not using bayes, bayes_ignore_from or _to rule") if $ignore;
+
+  return $ignore;
+}
+
+###########################################################################
+
+# Plugin hook.
+sub learn_message {
+  my ($self, $params) = @_;
+  my $isspam = $params->{isspam};
+  my $msg = $params->{msg};
+  my $id = $params->{id};
+
+  if (!$self->{conf}->{use_bayes}) { return; }
+
+  # Winnow cannot support learning to journal
+  $self->{main}->{learn_to_journal} = 0;
+
+  my $msgdata = $self->get_body_from_msg ($msg);
+  my $ret;
+
+  eval {
+    local $SIG{'__DIE__'};	# do not run user die() traps in here
+
+    my $ok;
+    if ($self->{main}->{learn_to_journal}) {
+      # If we're going to learn to journal, we'll try going r/o first...
+      # If that fails for some reason, let's try going r/w.  This happens
+      # if the DB doesn't exist yet.
+      $ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable();
+    } else {
+      $ok = $self->{store}->tie_db_writable();
+    }
+
+    if ($ok) {
+      $ret = $self->_learn_trapped ($isspam, $msg, $msgdata, $id);
+
+      if (!$self->{main}->{learn_caller_will_untie}) {
+        $self->{store}->untie_db();
+      }
+    }
+    1;
+  } or do {		# if we died, untie the dbs.
+    my $eval_stat = $@ ne '' ? $@ : "errno=$!";  chomp $eval_stat;
+    $self->{store}->untie_db();
+    die "osbf: (in learn) $eval_stat\n";
+  };
+
+  return $ret;
+}
+
+# this function is trapped by the wrapper above
+sub _learn_trapped {
+  my ($self, $isspam, $msg, $msgdata, $msgid) = @_;
+  my @msgid = ( $msgid );
+
+  if (!defined $msgid) {
+    @msgid = $self->get_msgid($msg);
+  }
+
+  foreach $msgid ( @msgid ) {
+    my $seen = $self->{store}->seen_get ($msgid);
+
+    if (defined ($seen)) {
+      if (($seen =~ /^s/ && $isspam) || ($seen =~ /^h/ && !$isspam)) {
+        dbg("osbf: $msgid already learnt correctly, not learning twice");
+        return 0;
+      } elsif ($seen !~ /^[hs]/) {
+        warn("osbf: db_seen corrupt: value='$seen' for $msgid, ignored");
+      } else {
+        # bug 3704: If the message was already learned, don't try learning it again.
+        # this prevents, for instance, manually learning as spam, then autolearning
+        # as ham, or visa versa.
+        if ($self->{main}->{learn_no_relearn}) {
+	  dbg("osbf: $msgid already learnt as opposite, not re-learning");
+	  return 0;
+	}
+
+        dbg("osbf: $msgid already learnt as opposite, forgetting first");
+
+        # kluge so that forget() won't untie the db on us ...
+        my $orig = $self->{main}->{learn_caller_will_untie};
+        $self->{main}->{learn_caller_will_untie} = 1;
+
+        my $fatal = !defined $self->{main}->{bayes_scanner}->forget ($msg);
+
+        # reset the value post-forget() ...
+        $self->{main}->{learn_caller_will_untie} = $orig;
+    
+        # forget() gave us a fatal error, so propagate that up
+        if ($fatal) {
+          dbg("osbf: forget() returned a fatal error, so learn() will too");
+	  return;
+        }
+      }
+
+      # we're only going to have seen this once, so stop if it's been
+      # seen already
+      last;
+    }
+  }
+
+  # Now that we're sure we haven't seen this message before ...
+  $msgid = $msgid[0];
+
+  if ($isspam) {
+    $self->{store}->nspam_nham_change (1, 0);
+  } else {
+    $self->{store}->nspam_nham_change (0, 1);
+  }
+
+  my $msgatime = $msg->receive_date();
+
+  # If the message atime comes back as being more than 1 day in the
+  # future, something's messed up and we should revert to current time as
+  # a safety measure.
+  #
+  $msgatime = time if ( $msgatime - time > 86400 );
+
+  # we don't use the weights when training
+  my $tokens = $self->tokenize($msg, $msgdata);
+
+  if ($isspam) {
+    $self->{store}->multi_tok_count_change(1, 0, $tokens, $msgatime);
+  } else {
+    $self->{store}->multi_tok_count_change(0, 1, $tokens, $msgatime);
+  }
+
+  $self->{store}->seen_put ($msgid, ($isspam ? 's' : 'h'));
+  $self->{store}->cleanup();
+
+  $self->{main}->call_plugins("bayes_learn", { toksref => $tokens,
+					       isspam => $isspam,
+					       msgid => $msgid,
+					       msgatime => $msgatime,
+					     });
+
+  dbg("osbf: learned '$msgid', atime: $msgatime");
+
+  1;
+}
+
+###########################################################################
+
+# Plugin hook.
+sub forget_message {
+  my ($self, $params) = @_;
+  my $msg = $params->{msg};
+  my $id = $params->{id};
+
+  if (!$self->{conf}->{use_bayes}) { return; }
+
+  my $msgdata = $self->get_body_from_msg ($msg);
+  my $ret;
+
+  # we still tie for writing here, since we write to the seen db
+  # synchronously
+  eval {
+    local $SIG{'__DIE__'};	# do not run user die() traps in here
+
+    my $ok;
+    if ($self->{main}->{learn_to_journal}) {
+      # If we're going to learn to journal, we'll try going r/o first...
+      # If that fails for some reason, let's try going r/w.  This happens
+      # if the DB doesn't exist yet.
+      $ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable();
+    } else {
+      $ok = $self->{store}->tie_db_writable();
+    }
+
+    if ($ok) {
+      $ret = $self->_forget_trapped ($msg, $msgdata, $id);
+
+      if (!$self->{main}->{learn_caller_will_untie}) {
+        $self->{store}->untie_db();
+      }
+    }
+    1;
+  } or do {		# if we died, untie the dbs.
+    my $eval_stat = $@ ne '' ? $@ : "errno=$!";  chomp $eval_stat;
+    $self->{store}->untie_db();
+    die "osbf: (in forget) $eval_stat\n";
+  };
+
+  return $ret;
+}
+
+# this function is trapped by the wrapper above
+sub _forget_trapped {
+  my ($self, $msg, $msgdata, $msgid) = @_;
+  my @msgid = ( $msgid );
+  my $isspam;
+
+  if (!defined $msgid) {
+    @msgid = $self->get_msgid($msg);
+  }
+
+  while( $msgid = shift @msgid ) {
+    my $seen = $self->{store}->seen_get ($msgid);
+
+    if (defined ($seen)) {
+      if ($seen eq 's') {
+        $isspam = 1;
+      } elsif ($seen eq 'h') {
+        $isspam = 0;
+      } else {
+        dbg("osbf: forget: msgid $msgid seen entry is neither ham nor spam, ignored");
+        return 0;
+      }
+
+      # messages should only be learned once, so stop if we find a msgid
+      # which was seen before
+      last;
+    }
+    else {
+      dbg("osbf: forget: msgid $msgid not learnt, ignored");
+    }
+  }
+
+  # This message wasn't learnt before, so return
+  if (!defined $isspam) {
+    dbg("osbf: forget: no msgid from this message has been learnt, skipping message");
+    return 0;
+  }
+  elsif ($isspam) {
+    $self->{store}->nspam_nham_change (-1, 0);
+  }
+  else {
+    $self->{store}->nspam_nham_change (0, -1);
+  }
+
+  # we don't use the weights when training
+  my $tokens = $self->tokenize($msg, $msgdata);
+
+  if ($isspam) {
+    $self->{store}->multi_tok_count_change (-1, 0, $tokens);
+  } else {
+    $self->{store}->multi_tok_count_change (0, -1, $tokens);
+  }
+
+  $self->{store}->seen_delete ($msgid);
+  $self->{store}->cleanup();
+
+  $self->{main}->call_plugins("bayes_forget", { toksref => $tokens,
+						isspam => $isspam,
+						msgid => $msgid,
+					      });
+
+  1;
+}
+
+###########################################################################
+
+# Plugin hook.
+sub learner_sync {
+  my ($self, $params) = @_;
+  if (!$self->{conf}->{use_bayes}) { return 0; }
+  dbg("osbf: osbf journal sync starting");
+  $self->{store}->sync($params);
+  dbg("osbf: osbf journal sync completed");
+}
+
+###########################################################################
+
+# Plugin hook.
+sub learner_expire_old_training {
+  my ($self, $params) = @_;
+  if (!$self->{conf}->{use_bayes}) { return 0; }
+  dbg("osbf: expiry starting");
+  $self->{store}->expire_old_tokens($params);
+  dbg("osbf: expiry completed");
+}
+
+###########################################################################
+
+# Plugin hook.
+# Check to make sure we can tie() the DB, and we have enough entries to do a scan
+# if we're told the caller will untie(), go ahead and leave the db tied.
+sub learner_is_scan_available {
+  my ($self, $params) = @_;
+
+  return 0 unless $self->{conf}->{use_bayes};
+  return 0 unless $self->{store}->tie_db_readonly();
+
+  # We need the DB to stay tied, so if the journal sync occurs, don't untie!
+  my $caller_untie = $self->{main}->{learn_caller_will_untie};
+  $self->{main}->{learn_caller_will_untie} = 1;
+
+  # Do a journal sync if necessary.  Do this before the nspam_nham_get()
+  # call since the sync may cause an update in the number of messages
+  # learnt.
+  $self->_opportunistic_calls(1);
+
+  # Reset the variable appropriately
+  $self->{main}->{learn_caller_will_untie} = $caller_untie;
+
+  my ($ns, $nn) = $self->{store}->nspam_nham_get();
+
+  if ($ns < $self->{conf}->{bayes_min_spam_num}) {
+    dbg("osbf: not available for scanning, only $ns spam(s) in osbf DB < ".$self->{conf}->{bayes_min_spam_num});
+    if (!$self->{main}->{learn_caller_will_untie}) {
+      $self->{store}->untie_db();
+    }
+    return 0;
+  }
+  if ($nn < $self->{conf}->{bayes_min_ham_num}) {
+    dbg("osbf: not available for scanning, only $nn ham(s) in osbf DB < ".$self->{conf}->{bayes_min_ham_num});
+    if (!$self->{main}->{learn_caller_will_untie}) {
+      $self->{store}->untie_db();
+    }
+    return 0;
+  }
+
+  return 1;
+}
+
+###########################################################################
+
+sub scan {
+  my ($self, $permsgstatus, $msg) = @_;
+  my $score;
+
+  return unless $self->{conf}->{use_learner};
+
+  # When we're doing a scan, we'll guarantee that we'll do the untie,
+  # so override the global setting until we're done.
+  my $caller_untie = $self->{main}->{learn_caller_will_untie};
+  $self->{main}->{learn_caller_will_untie} = 1;
+
+  goto skip if ($self->{main}->{bayes_scanner}->ignore_message($permsgstatus));
+  goto skip unless $self->learner_is_scan_available();
+
+  my ($ns, $nn) = $self->{store}->nspam_nham_get();
+  dbg("osbf: corpus size: nspam = $ns, nham = $nn");
+
+  my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);
+  my ($msgtokens, $tokweights) = $self->tokenize($msg, $msgdata);
+  my $tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens});
+
+  ## my $keep_pw = $self->{main}->have_plugin("bayes_scan");
+  my %pw;
+
+  foreach my $tokendata (@{$tokensdata}) {
+    my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata};
+
+    my $prob = $self->_compute_prob_for_token($token, $ns, $nn,
+                $tok_spam, $tok_ham, $tokweights->{$token});
+# dbg("osbf: JMD token '$msgtokens->{$token}' => ".(defined $prob ? (sprintf "%3.5f", $prob) : '??'));
+    next unless defined $prob;
+
+    $pw{$token} = {
+      prob => $prob,
+      spam_count => $tok_spam,
+      ham_count => $tok_ham,
+      atime => $atime
+    };
+  }
+
+  # If none of the tokens were found in the DB, we're going to skip
+  # this message...
+  if (!keys %pw) {
+    dbg("bayes: cannot use osbf on this message; none of the tokens were found in the database");
+    goto skip;
+  }
+
+  my $tcount_total = keys %{$msgtokens};
+  my $tcount_learned = keys %pw;
+
+  # Figure out the message receive time (used as atime below)
+  # If the message atime comes back as being in the future, something's
+  # messed up and we should revert to current time as a safety measure.
+  #
+  my $msgatime = $msg->receive_date();
+  my $now = time;
+  $msgatime = $now if ( $msgatime > $now );
+
+  # now take the $count most significant tokens and calculate probs using
+  # Robinson's formula.
+  my $count = N_SIGNIFICANT_TOKENS;
+  my @sorted;
+
+  my @touch_tokens;
+  my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = [];
+  my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = [];
+
+  my %tok_strength = map { $_ => (abs($pw{$_}->{prob} - 0.5)) } keys %pw;
+  my $log_each_token = (would_log('dbg', 'osbf') > 1);
+
+  foreach my $tok (
+        sort { $tok_strength{$b} <=> $tok_strength{$a} }
+        keys %pw)
+  {
+    if ($count-- < 0) { last; }
+    # next if ($tok_strength{$tok} <
+    # $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH);
+
+    my $pw = $pw{$tok}->{prob};
+
+    # What's more expensive, scanning headers for HAMMYTOKENS and
+    # SPAMMYTOKENS tags that aren't there or collecting data that
+    # won't be used?  Just collecting the data is certainly simpler.
+    #
+    my $raw_token = $msgtokens->{$tok} || "(unknown)";
+    my $s = $pw{$tok}->{spam_count};
+    my $n = $pw{$tok}->{ham_count};
+    my $a = $pw{$tok}->{atime};
+
+    if (0) {    # TODO
+    if ($pw < 0.5) {
+      push @$tinfo_hammy,  [$raw_token,$pw,$s,$n,$a];
+    } else {
+      push @$tinfo_spammy, [$raw_token,$pw,$s,$n,$a];
+    }
+    }
+
+    push (@sorted, $pw);
+
+    # update the atime on this token, it proved useful
+    push(@touch_tokens, $tok);
+
+    # dbg("osbf: JMD token '$raw_token' => ".(sprintf "%3.5f", $pw));
+    if ($log_each_token) {
+      dbg("osbf: token '$raw_token' => ".(sprintf "%3.5f", $pw));
+    }
+  }
+
+  if (!@sorted || (REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE > 0 &&
+        $#sorted <= REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE))
+  {
+    dbg("osbf: cannot use osbf on this message; not enough usable tokens found");
+    goto skip;
+  }
+
+  $score = Mail::SpamAssassin::Bayes::Combine::combine($ns, $nn, \@sorted);
+
+  # Couldn't come up with a probability?
+  goto skip unless defined $score;
+
+  # HACK HACK scale to a better range, since EDDC seems to bias low
+  $score *= 1.1; $score = 1.0 if $score > 1.0;
+
+  dbg("osbf: score = $score");
+
+  # no need to call tok_touch_all unless there were significant
+  # tokens and a score was returned
+  # we don't really care about the return value here
+  $self->{store}->tok_touch_all(\@touch_tokens, $msgatime);
+
+  $permsgstatus->{bayes_nspam} = $ns;
+  $permsgstatus->{bayes_nham} = $nn;
+
+  $self->{main}->call_plugins("bayes_scan", { toksref => $msgtokens,
+                                              probsref => \%pw,
+                                              score => $score,
+                                              msgatime => $msgatime,
+                                              significant_tokens => \@touch_tokens,
+                                            });
+
+skip:
+  if (!defined $score) {
+    dbg("osbf: not scoring message, returning undef");
+  }
+
+  # Take any opportunistic actions we can take
+  if ($self->{main}->{opportunistic_expire_check_only}) {
+    # we're supposed to report on expiry only -- so do the
+    # _opportunistic_calls() run for the journal only.
+    $self->_opportunistic_calls(1);
+    $permsgstatus->{bayes_expiry_due} = $self->{store}->expiry_due();
+  }
+  else {
+    $self->_opportunistic_calls();
+  }
+
+  # Do any cleanup we need to do
+  $self->{store}->cleanup();
+
+  # Reset the value accordingly
+  $self->{main}->{learn_caller_will_untie} = $caller_untie;
+
+  # If our caller won't untie the db, we need to do it.
+  if (!$caller_untie) {
+    $self->{store}->untie_db();
+  }
+
+  $permsgstatus->{tag_data}{BAYESTCHAMMY} =
+                        ($tinfo_hammy ? scalar @{$tinfo_hammy} : 0);
+  $permsgstatus->{tag_data}{BAYESTCSPAMMY} =
+                        ($tinfo_spammy ? scalar @{$tinfo_spammy} : 0);
+  $permsgstatus->{tag_data}{BAYESTCLEARNED} = $tcount_learned;
+  $permsgstatus->{tag_data}{BAYESTC} = $tcount_total;
+
+  return $score;
+}
+
+###########################################################################
+
+# Plugin hook.
+sub learner_dump_database {
+  my ($self, $params) = @_;
+  my $magic = $params->{magic};
+  my $toks = $params->{toks};
+  my $regex = $params->{regex};
+
+  # allow dump to occur even if use_bayes disables everything else ...
+  #return 0 unless $self->{conf}->{use_bayes};
+  return 0 unless $self->{store}->tie_db_readonly();
+  
+  my @vars = $self->{store}->get_storage_variables();
+
+  my($sb,$ns,$nh,$nt,$le,$oa,$bv,$js,$ad,$er,$na) = @vars;
+
+  my $template = '%3.3f %10u %10u %10u  %s'."\n";
+
+  if ( $magic ) {
+    printf ($template, 0.0, 0, $bv, 0, 'non-token data: osbf db version');
+    printf ($template, 0.0, 0, $ns, 0, 'non-token data: nspam');
+    printf ($template, 0.0, 0, $nh, 0, 'non-token data: nham');
+    printf ($template, 0.0, 0, $nt, 0, 'non-token data: ntokens');
+    printf ($template, 0.0, 0, $oa, 0, 'non-token data: oldest atime');
+    printf ($template, 0.0, 0, $na, 0, 'non-token data: newest atime') if ( $bv >= 2 );
+    printf ($template, 0.0, 0, $sb, 0, 'non-token data: current scan-count') if ( $bv < 2 );
+    printf ($template, 0.0, 0, $js, 0, 'non-token data: last journal sync atime') if ( $bv >= 2 );
+    printf ($template, 0.0, 0, $le, 0, 'non-token data: last expiry atime');
+    if ( $bv >= 2 ) {
+      printf ($template, 0.0, 0, $ad, 0, 'non-token data: last expire atime delta');
+      printf ($template, 0.0, 0, $er, 0, 'non-token data: last expire reduction count');
+    }
+  }
+
+  if ( $toks ) {
+    # let the store sort out the db_toks
+    $self->{store}->dump_db_toks($template, $regex, @vars);
+  }
+
+  if (!$self->{main}->{learn_caller_will_untie}) {
+    $self->{store}->untie_db();
+  }
+  return 1;
+}
+
+###########################################################################
+# TODO: these are NOT public, but the test suite needs to call them.
+
+sub get_msgid {
+  my ($self, $msg) = @_;
+
+  my @msgid;
+
+  my $msgid = $msg->get_header("Message-Id");
+  if (defined $msgid && $msgid ne '' && $msgid !~ /^\s*<\s*(?:\@sa_generated)?>.*$/) {
+    # remove \r and < and > prefix/suffixes
+    chomp $msgid;
+    $msgid =~ s/^<//; $msgid =~ s/>.*$//g;
+    push(@msgid, $msgid);
+  }
+
+  # Use sha1_hex(Date:, last received: and top N bytes of body)
+  # where N is MIN(1024 bytes, 1/2 of body length)
+  #
+  my $date = $msg->get_header("Date");
+  $date = "None" if (!defined $date || $date eq ''); # No Date?
+
+  my @rcvd = $msg->get_header("Received");
+  my $rcvd = $rcvd[$#rcvd];
+  $rcvd = "None" if (!defined $rcvd || $rcvd eq ''); # No Received?
+
+  # Make a copy since pristine_body is a reference ...
+  my $body = join('', $msg->get_pristine_body());
+  if (length($body) > 64) { # Small Body?
+    my $keep = ( length $body > 2048 ? 1024 : int(length($body) / 2) );
+    substr($body, $keep) = '';
+  }
+
+  unshift(@msgid, sha1_hex($date."\000".$rcvd."\000".$body).'@sa_generated');
+
+  return wantarray ? @msgid : $msgid[0];
+}
+
+sub get_body_from_msg {
+  my ($self, $msg) = @_;
+
+  if (!ref $msg) {
+    # I have no idea why this seems to happen. TODO
+    warn "osbf: msg not a ref: '$msg'";
+    return { };
+  }
+
+  my $permsgstatus =
+        Mail::SpamAssassin::PerMsgStatus->new($self->{main}, $msg);
+  $msg->extract_message_metadata ($permsgstatus);
+  my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);
+  $permsgstatus->finish();
+
+  if (!defined $msgdata) {
+    # why?!
+    warn "osbf: failed to get body for ".scalar($self->get_msgid($self->{msg}))."\n";
+    return { };
+  }
+
+  return $msgdata;
+}
+
+sub _get_msgdata_from_permsgstatus {
+  my ($self, $msg) = @_;
+
+  my $msgdata = { };
+  $msgdata->{bayes_token_body} = $msg->{msg}->get_visible_rendered_body_text_array();
+  $msgdata->{bayes_token_inviz} = $msg->{msg}->get_invisible_rendered_body_text_array();
+  @{$msgdata->{bayes_token_uris}} = $msg->get_uri_list();
+  return $msgdata;
+}
+
+###########################################################################
+
+# The calling functions expect a uniq'ed array of tokens ...
+sub tokenize {
+  my ($self, $msg, $msgdata) = @_;
+
+  # the body
+  my @tokens = map { $self->_tokenize_line ($_, '', 1) }
+                                    @{$msgdata->{bayes_token_body}};
+
+  # the URI list
+  push (@tokens, map { $self->_tokenize_line ($_, '', 2) }
+                                    @{$msgdata->{bayes_token_uris}});
+
+  # add invisible tokens
+  if (ADD_INVIZ_TOKENS_I_PREFIX) {
+    push (@tokens, map { $self->_tokenize_line ($_, "I*:", 1) }
+                                    @{$msgdata->{bayes_token_inviz}});
+  }
+  if (ADD_INVIZ_TOKENS_NO_PREFIX) {
+    push (@tokens, map { $self->_tokenize_line ($_, "", 1) }
+                                    @{$msgdata->{bayes_token_inviz}});
+  }
+
+  # Tokenize the headers
+  my %hdrs = $self->_tokenize_headers ($msg);
+  while( my($prefix, $value) = each %hdrs ) {
+    push(@tokens, $self->_tokenize_line ($value, "H$prefix:", 0));
+  }
+
+  # Go ahead and uniq the array, skip null tokens (can happen sometimes)
+  # generate an SHA1 hash and take the lower 40 bits as our token
+  my %tokens;
+  my %weights;
+  foreach my $token (@tokens) {
+    my $distance = '0';     # OSB bigram token distance; default to 0
+    if ($token && $token =~ s/^([0-5])://) {
+      $distance = $1;
+    }
+
+    next unless length($token); # skip still 0-length tokens
+    my $hash = substr(sha1($token), -5);
+    $tokens{$hash} = $token;
+
+    # set the weight to be the lowest distance for that token
+    if (!(defined $weights{$hash})
+          || ($weights{$hash} > $distance))
+    {
+      $weights{$hash} = $distance;
+    }
+  }
+
+  # return the keys == tokens ...
+  return (\%tokens, \%weights);
+}
+
+sub _tokenize_line {
+  my $self = $_[0];
+  my $tokprefix = $_[2];
+  my $region = $_[3];
+
+  my @rettokens;
+  my $magic_re = $self->{store}->get_magic_re();
+  my ($w1,$w2,$w3,$w4,$w5) = ('','','','','');
+
+  my @words = ($_[1] =~
+      /([^\p{Z}\p{C}][\/!?#]?[-\p{L}\p{M}\p{N}]*(?:['"=;]|\/?>|:\/*)?)/g);
+  foreach my $token (@words)
+  {
+    next if ($token =~ /^[\.\,]+$/);    # just punctuation
+    # $token =~ s/^[-'"\.,]+//;        # trim non-alphanum chars at start or end
+    # $token =~ s/[-'"\.,]+$//;        # so we don't get loads of '"foo' tokens
+
+    # Skip false magic tokens
+    # TVD: we need to do a defined() check since SQL doesn't have magic
+    # tokens, so the SQL BayesStore returns undef.  I really want a way
+    # of optimizing that out, but I haven't come up with anything yet.
+    #
+    next if ( defined $magic_re && $token =~ /$magic_re/ );
+
+    # are we in the body?  If so, apply some body-specific breakouts
+    if ($region == 1 || $region == 2) {
+      if (0 && CHEW_BODY_MAILADDRS && $token =~ /\S\@\S/i) {
+	push (@rettokens, $self->_tokenize_mail_addrs ($token));
+      }
+      elsif (0 && CHEW_BODY_URIS && $token =~ /\S\.[a-z]/i) {
+	push (@rettokens, "UD:".$token); # the full token
+	my $bit = $token; while ($bit =~ s/^[^\.]+\.(.+)$/$1/gs) {
+	  push (@rettokens, "UD:".$1); # UD = URL domain
+	}
+      }
+    }
+
+    # decompose tokens?  do this after shortening long tokens
+    if ($region == 1 || $region == 2) {
+      if (0 && DECOMPOSE_BODY_TOKENS) {
+        if ($token =~ /[^\w:\*]/) {
+          my $decompd = $token;                        # "Foo!"
+          $decompd =~ s/[^\w:\*]//gs;
+          push (@rettokens, $tokprefix.$decompd);      # "Foo"
+        }
+
+        if ($token =~ /[A-Z]/) {
+          my $decompd = $token; $decompd = lc $decompd;
+          push (@rettokens, $tokprefix.$decompd);      # "foo!"
+
+          if ($token =~ /[^\w:\*]/) {
+            $decompd =~ s/[^\w:\*]//gs;
+            push (@rettokens, $tokprefix.$decompd);    # "foo"
+          }
+        }
+      }
+    }
+
+    $w5 = $w4;
+    $w4 = $w3;
+    $w3 = $w2;
+    $w2 = $w1;
+    $w1 = $tokprefix.$token;
+
+    # here's the OSB (orthogonal sparse bigrams) part.
+    # record intra-token distance for weighting, too
+    push (@rettokens, $w2.' '.$w1);
+    push (@rettokens, '1:'.$w3.' '.$w1);    
+    push (@rettokens, '2:'.$w4.' '.$w1);
+    push (@rettokens, '3:'.$w5.' '.$w1);
+  }
+
+  return @rettokens;
+}
+
+sub _tokenize_headers {
+  my ($self, $msg) = @_;
+
+  my %parsed;
+
+  my %user_ignore;
+  $user_ignore{lc $_} = 1 for @{$self->{main}->{conf}->{bayes_ignore_headers}};
+
+  # get headers in array context
+  my @hdrs;
+  my @rcvdlines;
+  for ($msg->get_all_headers()) {
+    # first, keep a copy of Received headers, so we can strip down to last 2
+    if (/^Received:/i) {
+      push(@rcvdlines, $_);
+      next;
+    }
+    # and now skip lines for headers we don't want (including all Received)
+    next if /^${IGNORED_HDRS}:/i;
+    next if IGNORE_MSGID_TOKENS && /^Message-ID:/i;
+    push(@hdrs, $_);
+  }
+  push(@hdrs, $msg->get_all_metadata());
+
+  # and re-add the last 2 received lines: usually a good source of
+  # spamware tokens and HELO names.
+  if ($#rcvdlines >= 0) { push(@hdrs, $rcvdlines[$#rcvdlines]); }
+  if ($#rcvdlines >= 1) { push(@hdrs, $rcvdlines[$#rcvdlines-1]); }
+
+  for (@hdrs) {
+    next unless /\S/;
+    my ($hdr, $val) = split(/:/, $_, 2);
+
+    # remove user-specified headers here, after Received, in case they
+    # want to ignore that too
+    next if exists $user_ignore{lc $hdr};
+
+    # Prep the header value
+    $val ||= '';
+    chomp($val);
+
+    # special tokenization for some headers:
+    if ($hdr =~ /^(?:|X-|Resent-)Message-Id$/i) {
+      $val = $self->_pre_chew_message_id ($val);
+    }
+    elsif (PRE_CHEW_ADDR_HEADERS && $hdr =~ /^(?:|X-|Resent-)
+	(?:Return-Path|From|To|Cc|Reply-To|Errors-To|Mail-Followup-To|Sender)$/ix)
+    {
+      $val = $self->_pre_chew_addr_header ($val);
+    }
+    elsif ($hdr eq 'Received') {
+      $val = $self->_pre_chew_received ($val);
+    }
+    elsif ($hdr eq 'Content-Type') {
+      $val = $self->_pre_chew_content_type ($val);
+    }
+    elsif ($hdr eq 'MIME-Version') {
+      $val =~ s/1\.0//;		# totally innocuous
+    }
+    elsif ($hdr =~ /^${MARK_PRESENCE_ONLY_HDRS}$/i) {
+      $val = "1"; # just mark the presence, they create lots of hapaxen
+    }
+
+    if (MAP_HEADERS_MID) {
+      if ($hdr =~ /^(?:In-Reply-To|References|Message-ID)$/i) {
+        $parsed{"*MI"} = $val;
+      }
+    }
+    if (MAP_HEADERS_FROMTOCC) {
+      if ($hdr =~ /^(?:From|To|Cc)$/i) {
+        $parsed{"*Ad"} = $val;
+      }
+    }
+    if (MAP_HEADERS_USERAGENT) {
+      if ($hdr =~ /^(?:X-Mailer|User-Agent)$/i) {
+        $parsed{"*UA"} = $val;
+      }
+    }
+
+    if (exists $parsed{$hdr}) {
+      $parsed{$hdr} .= " ".$val;
+    } else {
+      $parsed{$hdr} = $val;
+    }
+    if (would_log('dbg', 'osbf') > 1) {
+      dbg("osbf: header tokens for $hdr = \"$parsed{$hdr}\"");
+    }
+  }
+
+  return %parsed;
+}
+
+sub _pre_chew_content_type {
+  my ($self, $val) = @_;
+
+  # hopefully this will retain good bits without too many hapaxen
+  if ($val =~ s/boundary=[\"\'](.*?)[\"\']/ /ig) {
+    my $boundary = $1;
+    $boundary =~ s/[a-fA-F0-9]/H/gs;
+    # break up blocks of separator chars so they become their own tokens
+    $boundary =~ s/([-_\.=]+)/ $1 /gs;
+    $val .= $boundary;
+  }
+
+  # stop-list words for Content-Type header: these wind up totally gray
+  $val =~ s/\b(?:text|charset)\b//;
+
+  $val;
+}
+
+sub _pre_chew_message_id {
+  my ($self, $val) = @_;
+  # we can (a) get rid of a lot of hapaxen and (b) increase the token
+  # specificity by pre-parsing some common formats.
+
+  # Outlook Express format:
+  $val =~ s/<([0-9a-f]{4})[0-9a-f]{4}[0-9a-f]{4}\$
+           ([0-9a-f]{4})[0-9a-f]{4}\$
+           ([0-9a-f]{8})\@(\S+)>/ OEA$1 OEB$2 OEC$3 $4 /gx;
+
+  # Exim:
+  $val =~ s/<[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]\@//;
+
+  # Sendmail:
+  $val =~ s/<20\d\d[01]\d[0123]\d[012]\d[012345]\d[012345]\d\.
+           [A-F0-9]{10,12}\@//gx;
+
+  # try to split Message-ID segments on probable ID boundaries. Note that
+  # Outlook message-ids seem to contain a server identifier ID in the last
+  # 8 bytes before the @.  Make sure this becomes its own token, it's a
+  # great spam-sign for a learning system!  Be sure to split on ".".
+  $val =~ s/[^_A-Za-z0-9]/ /g;
+  $val;
+}
+
+sub _pre_chew_received {
+  my ($self, $val) = @_;
+
+  # Thanks to Dan for these.  Trim out "useless" tokens; sendmail-ish IDs
+  # and valid-format RFC-822/2822 dates
+
+  $val =~ s/\swith\sSMTP\sid\sg[\dA-Z]{10,12}\s/ /gs;  # Sendmail
+  $val =~ s/\swith\sESMTP\sid\s[\dA-F]{10,12}\s/ /gs;  # Sendmail
+  $val =~ s/\bid\s[a-zA-Z0-9]{7,20}\b/ /gs;    # Sendmail
+  $val =~ s/\bid\s[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]/ /gs; # exim
+
+  $val =~ s/(?:(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s)?
+           [0-3\s]?[0-9]\s
+           (?:Jan|Feb|Ma[ry]|Apr|Ju[nl]|Aug|Sep|Oct|Nov|Dec)\s
+           (?:19|20)?[0-9]{2}\s
+           [0-2][0-9](?:\:[0-5][0-9]){1,2}\s
+           (?:\s*\(|\)|\s*(?:[+-][0-9]{4})|\s*(?:UT|[A-Z]{2,3}T))*
+           //gx;
+
+  # IPs: break down to nearest /24, to reduce hapaxes -- EXCEPT for
+  # IPs in the 10 and 192.168 ranges, they gets lots of significant tokens
+  # (on both sides)
+  # also make a dup with the full IP, as fodder for
+  # bayes_dump_to_trusted_networks: "H*r:ip*aaa.bbb.ccc.ddd"
+  $val =~ s{\b(\d{1,3}\.)(\d{1,3}\.)(\d{1,3})(\.\d{1,3})\b}{
+           if ($2 eq '10' || ($2 eq '192' && $3 eq '168')) {
+             $1.$2.$3.$4.
+		" ip*".$1.$2.$3.$4." ";
+           } else {
+             $1.$2.$3.
+		" ip*".$1.$2.$3.$4." ";
+           }
+         }gex;
+
+  # trim these: they turn out as the most common tokens, but with a
+  # prob of about .5.  waste of space!
+  $val =~ s/\b(?:with|from|for|SMTP|ESMTP)\b/ /g;
+
+  $val;
+}
+
+sub _pre_chew_addr_header {
+  my ($self, $val) = @_;
+  local ($_);
+
+  my @addrs = $self->{main}->find_all_addrs_in_line ($val);
+  my @toks;
+  foreach (@addrs) {
+    push (@toks, $self->_tokenize_mail_addrs ($_));
+  }
+  return join (' ', @toks);
+}
+
+sub _tokenize_mail_addrs {
+  my ($self, $addr) = @_;
+
+  ($addr =~ /(.+)\@(.+)$/) or return ();
+  my @toks;
+  push(@toks, "U*".$1, "D*".$2);
+  $_ = $2; while (s/^[^\.]+\.(.+)$/$1/gs) { push(@toks, "D*".$1); }
+  return @toks;
+}
+
+
+###########################################################################
+
+our $distance_weights = [ 3125, 256, 27, 4 ];
+
+# compute the probability that a token is spammish
+sub _compute_prob_for_token {
+  my ($self, $token, $ns, $nn, $s, $n, $distance) = @_;
+
+  # we allow the caller to give us the token information, just
+  # to save a potentially expensive lookup
+  if (!defined($s) || !defined($n)) {
+    ($s, $n, undef) = $self->{store}->tok_get ($token);
+  }
+  return if ($s == 0 && $n == 0);
+
+  # we can't do anything if we haven't trained on any ham/spam
+  return if ( $ns == 0 || $nn == 0 );   
+
+  # weighting for OSB sparse features
+  my $weight;
+  if (defined $distance) {
+    $weight = $distance_weights->[$distance];
+  }
+  $weight ||= $distance_weights->[0];
+
+  # apply the EDDC algorithm, specifically the part from section 4
+  # of the osbf-eddc.pdf paper
+
+  # normalized count of docs containing feature in spam/ham
+  my $NDfs = ($s * 10000) / $ns;
+  my $NDfh = ($n * 10000) / $nn;
+
+  # these values are as dictated in the EDDC paper, except for K3;
+  # normally that should be 8, but 1 allows strong tokens to reach
+  # 0.99 instead of 0.125, which makes more sense for us
+  my $K1 = 0.25;
+  my $K2 = 10;
+  my $K3 = 1;
+
+  my $Sumf = $s + $n; die "assert: Sumf == 0" unless $Sumf;
+  my $WdotSumf = $weight * $Sumf;
+
+  # the resulting confidence factor equation
+  my $CF = ((((($NDfs - $NDfh) ** 2) + ($NDfs * $NDfh) - ($K1 / $Sumf))
+              / (($NDfs + $NDfh) ** 2)) ** $K2)
+            * ($WdotSumf / (1 + $K3 * $WdotSumf));
+
+  my $ratios = ($s / $ns);
+  my $ration = ($n / $nn);
+  my $prob;
+
+  if ($ratios == 0 && $ration == 0) {
+    warn "osbf: oops? ratios == ration == 0";
+    return;
+  } else {
+    $prob = ($ratios / ($ration + $ratios));
+    $prob = ($prob - 0.5) * $CF + 0.5;  # apply EDDC
+  }
+
+  # warn "JMD $s/$ns $n/$nn cf=$CF p=$prob";
+
+  return $prob;
+}
+
+###########################################################################
+# If a token is neither hammy nor spammy, return 0.
+# For a spammy token, return the minimum number of additional ham messages
+# it would have had to appear in to no longer be spammy.  Hammy tokens
+# are handled similarly.  That's what the function does (at the time
+# of this writing, 31 July 2003, 16:02:55 CDT).  It would be slightly
+# more useful if it returned the number of /additional/ ham messages
+# a spammy token would have to appear in to no longer be spammy but I
+# fear that might require the solution to a cubic equation, and I
+# just don't have the time for that now.
+
+sub compute_declassification_distance {
+  my ($self, $Ns, $Nn, $ns, $nn, $prob) = @_;
+  return 0;
+  #TODO?  probably not
+}
+
+###########################################################################
+
+sub _opportunistic_calls {
+  my($self, $journal_only) = @_;
+
+  # If we're not already tied, abort.
+  if (!$self->{store}->db_readable()) {
+    dbg("osbf: opportunistic call attempt failed, DB not readable");
+    return;
+  }
+
+  # Is an expire or sync running?
+  my $running_expire = $self->{store}->get_running_expire_tok();
+  if ( defined $running_expire && $running_expire+$OPPORTUNISTIC_LOCK_VALID > time() ) {
+    dbg("osbf: opportunistic call attempt skipped, found fresh running expire magic token");
+    return;
+  }
+
+  # handle expiry and syncing
+  if (!$journal_only && $self->{store}->expiry_due()) {
+    dbg("osbf: opportunistic call found expiry due");
+
+    # sync will bring the DB R/W as necessary, and the expire will remove
+    # the running_expire token, may untie as well.
+    $self->{main}->{bayes_scanner}->sync(1,1);
+  }
+  elsif ( $self->{store}->sync_due() ) {
+    dbg("osbf: opportunistic call found journal sync due");
+
+    # sync will bring the DB R/W as necessary, may untie as well
+    $self->{main}->{bayes_scanner}->sync(1,0);
+
+    # We can only remove the running_expire token if we're doing R/W
+    if ($self->{store}->db_writable()) {
+      $self->{store}->remove_running_expire_tok();
+    }
+  }
+
+  return;
+}
+
+###########################################################################
+
+sub learner_new {
+  my ($self) = @_;
+
+  if ($self->{conf}->{osbf_store_module}) {
+    my $module = $self->{conf}->{osbf_store_module};
+    $module = untaint_var($module);  # good enough?
+    my $store;
+
+    eval '
+      require '.$module.';
+      $store = '.$module.'->new($self);
+      1;
+    ' or do {
+      my $eval_stat = $@ ne '' ? $@ : "errno=$!";  chomp $eval_stat;
+      die "osbf: (in new) $eval_stat\n";
+    };
+    $self->{store} = $store;
+  }
+  else {
+    require Mail::SpamAssassin::OSBF::Store::DBM;
+    $self->{store} = Mail::SpamAssassin::OSBF::Store::DBM->new($self);
+  }
+
+  $self;
+}
+
+1;
+
+=back
+
+=cut
Index: rules/v330.pre
===================================================================
--- rules/v330.pre	(.../trunk)	(revision 0)
+++ rules/v330.pre	(.../branches/bug-5293-pluginized-bayes)	(revision 602913)
@@ -0,0 +1,20 @@
+# This is the right place to customize your installation of SpamAssassin.
+#
+# See 'perldoc Mail::SpamAssassin::Conf' for details of what can be
+# tweaked.
+#
+# This file was installed during the installation of SpamAssassin 3.3.0,
+# and contains plugin loading commands for the new plugins added in that
+# release.  It will not be overwritten during future SpamAssassin installs,
+# so you can modify it to enable some disabled-by-default plugins below,
+# if you so wish.
+#
+# There are now multiple files read to enable plugins in the
+# /etc/mail/spamassassin directory; previously only one, "init.pre" was
+# read.  Now both "init.pre", "v310.pre", and any other files ending in
+# ".pre" will be read.  As future releases are made, new plugins will be
+# added to new files, named according to the release they're added in.
+###########################################################################
+
+# OSBF - EXPERIMENTAL alternative to Bayes
+# loadplugin Mail::SpamAssassin::Plugin::OSBF
Index: masses/bayes-testing/bayes-10pcv-driver
===================================================================
--- masses/bayes-testing/bayes-10pcv-driver	(.../trunk)	(revision 602913)
+++ masses/bayes-testing/bayes-10pcv-driver	(.../branches/bug-5293-pluginized-bayes)	(revision 602913)
@@ -56,6 +56,7 @@
 echo "
 
 bayes_path                $tmpdir/dbs/bayes
+osbf_path                $tmpdir/dbs/osbf
 bayes_auto_learn          0
 bayes_min_ham_num         10
 bayes_min_spam_num        10
@@ -76,6 +77,10 @@
   echo "Restoring full learned DBs..."
   ( cd $tmpdir; rm -rf dbs; tar xf learned-all.tar )
 }
+runcmd () {
+  echo "$*"
+  time $*
+}
 
 if [ $LEARN_ALL_THEN_FORGET_TEST_SET = 1 ] ; then
 
@@ -87,14 +92,14 @@
 
   (
   echo -n "Learning from all ham buckets..." ; date
-  time sa-learn --ham --randseed=1 --no-sync $learnargs \
+  runcmd sa-learn --ham --randseed=1 --no-sync $learnargs \
 	  --showdots --mbox --config-file=$tmpdir/rules $testdir/cor/ham/*
 
   echo -n "Learning from all spam buckets..." ; date
-  time sa-learn --spam --randseed=1 --no-sync $learnargs \
+  runcmd sa-learn --spam --randseed=1 --no-sync $learnargs \
 	  --showdots --mbox --config-file=$tmpdir/rules $testdir/cor/spam/*
 
-  time sa-learn --sync $learnargs --config-file=$tmpdir/rules
+  runcmd sa-learn --sync $learnargs --config-file=$tmpdir/rules
 
   echo -n "Done learning. " ; date
   ) 2>&1 | tee $results/learn.log
@@ -140,30 +145,30 @@
 
   if [ $LEARN_ALL_THEN_FORGET_TEST_SET = 1 ] ; then
     echo "Forgetting contents of test ham bucket..."
-    time sa-learn --forget --config-file=$tmpdir/rules --showdots \
+    runcmd sa-learn --forget --config-file=$tmpdir/rules --showdots \
 			  --mbox $rdir/hbuckettest
 
     echo "Forgetting contents of test spam bucket..."
-    time sa-learn --forget --config-file=$tmpdir/rules --showdots \
+    runcmd sa-learn --forget --config-file=$tmpdir/rules --showdots \
 			  --mbox $rdir/sbuckettest
 
   else
     echo "Learning contents of learn ham bucket..."
-    time sa-learn --ham --randseed=1 --no-sync $learnargs \
+    runcmd sa-learn --ham --randseed=1 --no-sync $learnargs \
 	    --showdots --mbox --config-file=$tmpdir/rules $rdir/hbucketlearn
 
     echo "Learning contents of learn spam bucket..."
-    time sa-learn --spam --randseed=1 --no-sync $learnargs \
+    runcmd sa-learn --spam --randseed=1 --no-sync $learnargs \
 	    --showdots --mbox --config-file=$tmpdir/rules $rdir/sbucketlearn
 
-    time sa-learn --sync $learnargs --config-file=$tmpdir/rules
+    runcmd sa-learn --sync $learnargs --config-file=$tmpdir/rules
 
     echo "Dumping bayes DB..."
     ( cd .. ; sa-learn --dump --dbpath=$tmpdir/dbs/bayes ) \
 	  > $rdir/bayes_db.dump
   fi
 
-  time sa-learn --sync --config-file=$tmpdir/rules
+  runcmd sa-learn --sync --config-file=$tmpdir/rules
 
   # take a copy of the trained Bayes DBs, gzipped
   ( cd $tmpdir ; tar cf - dbs | gzip -c > $rdir/dbs.tgz )
@@ -198,12 +203,12 @@
 
   else
     echo "Running mass-check on ham bucket..."
-    time ./mass-check -c=$tmpdir/rules -p=$tmpdir/rules --showdots \
+    runcmd ./mass-check -c=$tmpdir/rules -p=$tmpdir/rules --showdots \
 	  --bayes --mbox $rdir/hbuckettest \
 	  > $rdir/nonspam.log
 
     echo "Running mass-check on spam bucket..."
-    time ./mass-check -c=$tmpdir/rules -p=$tmpdir/rules --showdots \
+    runcmd ./mass-check -c=$tmpdir/rules -p=$tmpdir/rules --showdots \
 	  --bayes --mbox $rdir/sbuckettest \
 	  > $rdir/spam.log
   fi
Index: rules/v330.pre
===================================================================
--- rules/v330.pre	(.../trunk)	(revision 0)
+++ rules/v330.pre	(.../branches/bug-5293-pluginized-bayes)	(revision 602913)
@@ -0,0 +1,20 @@
+# This is the right place to customize your installation of SpamAssassin.
+#
+# See 'perldoc Mail::SpamAssassin::Conf' for details of what can be
+# tweaked.
+#
+# This file was installed during the installation of SpamAssassin 3.3.0,
+# and contains plugin loading commands for the new plugins added in that
+# release.  It will not be overwritten during future SpamAssassin installs,
+# so you can modify it to enable some disabled-by-default plugins below,
+# if you so wish.
+#
+# There are now multiple files read to enable plugins in the
+# /etc/mail/spamassassin directory; previously only one, "init.pre" was
+# read.  Now both "init.pre", "v310.pre", and any other files ending in
+# ".pre" will be read.  As future releases are made, new plugins will be
+# added to new files, named according to the release they're added in.
+###########################################################################
+
+# OSBF - EXPERIMENTAL alternative to Bayes
+# loadplugin Mail::SpamAssassin::Plugin::OSBF
Index: masses/bayes-testing/bayes-10pcv-driver
===================================================================
--- masses/bayes-testing/bayes-10pcv-driver	(.../trunk)	(revision 602913)
+++ masses/bayes-testing/bayes-10pcv-driver	(.../branches/bug-5293-pluginized-bayes)	(revision 602913)
@@ -56,6 +56,7 @@
 echo "
 
 bayes_path                $tmpdir/dbs/bayes
+osbf_path                $tmpdir/dbs/osbf
 bayes_auto_learn          0
 bayes_min_ham_num         10
 bayes_min_spam_num        10
@@ -76,6 +77,10 @@
   echo "Restoring full learned DBs..."
   ( cd $tmpdir; rm -rf dbs; tar xf learned-all.tar )
 }
+runcmd () {
+  echo "$*"
+  time $*
+}
 
 if [ $LEARN_ALL_THEN_FORGET_TEST_SET = 1 ] ; then
 
@@ -87,14 +92,14 @@
 
   (
   echo -n "Learning from all ham buckets..." ; date
-  time sa-learn --ham --randseed=1 --no-sync $learnargs \
+  runcmd sa-learn --ham --randseed=1 --no-sync $learnargs \
 	  --showdots --mbox --config-file=$tmpdir/rules $testdir/cor/ham/*
 
   echo -n "Learning from all spam buckets..." ; date
-  time sa-learn --spam --randseed=1 --no-sync $learnargs \
+  runcmd sa-learn --spam --randseed=1 --no-sync $learnargs \
 	  --showdots --mbox --config-file=$tmpdir/rules $testdir/cor/spam/*
 
-  time sa-learn --sync $learnargs --config-file=$tmpdir/rules
+  runcmd sa-learn --sync $learnargs --config-file=$tmpdir/rules
 
   echo -n "Done learning. " ; date
   ) 2>&1 | tee $results/learn.log
@@ -140,30 +145,30 @@
 
   if [ $LEARN_ALL_THEN_FORGET_TEST_SET = 1 ] ; then
     echo "Forgetting contents of test ham bucket..."
-    time sa-learn --forget --config-file=$tmpdir/rules --showdots \
+    runcmd sa-learn --forget --config-file=$tmpdir/rules --showdots \
 			  --mbox $rdir/hbuckettest
 
     echo "Forgetting contents of test spam bucket..."
-    time sa-learn --forget --config-file=$tmpdir/rules --showdots \
+    runcmd sa-learn --forget --config-file=$tmpdir/rules --showdots \
 			  --mbox $rdir/sbuckettest
 
   else
     echo "Learning contents of learn ham bucket..."
-    time sa-learn --ham --randseed=1 --no-sync $learnargs \
+    runcmd sa-learn --ham --randseed=1 --no-sync $learnargs \
 	    --showdots --mbox --config-file=$tmpdir/rules $rdir/hbucketlearn
 
     echo "Learning contents of learn spam bucket..."
-    time sa-learn --spam --randseed=1 --no-sync $learnargs \
+    runcmd sa-learn --spam --randseed=1 --no-sync $learnargs \
 	    --showdots --mbox --config-file=$tmpdir/rules $rdir/sbucketlearn
 
-    time sa-learn --sync $learnargs --config-file=$tmpdir/rules
+    runcmd sa-learn --sync $learnargs --config-file=$tmpdir/rules
 
     echo "Dumping bayes DB..."
     ( cd .. ; sa-learn --dump --dbpath=$tmpdir/dbs/bayes ) \
 	  > $rdir/bayes_db.dump
   fi
 
-  time sa-learn --sync --config-file=$tmpdir/rules
+  runcmd sa-learn --sync --config-file=$tmpdir/rules
 
   # take a copy of the trained Bayes DBs, gzipped
   ( cd $tmpdir ; tar cf - dbs | gzip -c > $rdir/dbs.tgz )
@@ -198,12 +203,12 @@
 
   else
     echo "Running mass-check on ham bucket..."
-    time ./mass-check -c=$tmpdir/rules -p=$tmpdir/rules --showdots \
+    runcmd ./mass-check -c=$tmpdir/rules -p=$tmpdir/rules --showdots \
 	  --bayes --mbox $rdir/hbuckettest \
 	  > $rdir/nonspam.log
 
     echo "Running mass-check on spam bucket..."
-    time ./mass-check -c=$tmpdir/rules -p=$tmpdir/rules --showdots \
+    runcmd ./mass-check -c=$tmpdir/rules -p=$tmpdir/rules --showdots \
 	  --bayes --mbox $rdir/sbuckettest \
 	  > $rdir/spam.log
   fi