diff -u -N -r Mail-SpamAssassin-2.60-dist/Makefile.PL Mail-SpamAssassin-2.60/Makefile.PL --- Mail-SpamAssassin-2.60-dist/Makefile.PL Fri Jun 27 11:48:37 2003 +++ Mail-SpamAssassin-2.60/Makefile.PL Mon Jun 30 16:22:34 2003 @@ -90,7 +90,7 @@ 'VERSION_FROM' => 'lib/Mail/SpamAssassin.pm', # finds $VERSION 'EXE_FILES' => [ - 'spamassassin', 'sa-learn', + 'spamassassin', 'sa-learn', 'sa-btok-learn', @SPAMD_EXE_FILES ], @@ -322,6 +322,10 @@ $(PERL) build/preprocessor $(FIXBYTES) $(FIXVARS) $(FIXBANG) -i$? -o$@ $(CHMOD) $(PERM_RWX) $@ +sa-btok-learn: sa-btok-learn.raw + $(PERL) build/preprocessor $(FIXBYTES) $(FIXVARS) $(FIXBANG) -i$? -o$@ + $(CHMOD) $(PERM_RWX) $@ + spamd/spamd: spamd/spamd.raw $(PERL) build/preprocessor $(FIXBYTES) $(FIXVARS) $(FIXBANG) -i$? -o$@ $(CHMOD) $(PERM_RWX) $@ diff -u -N -r Mail-SpamAssassin-2.60-dist/lib/Mail/SpamAssassin/Bayes.pm Mail-SpamAssassin-2.60/lib/Mail/SpamAssassin/Bayes.pm --- Mail-SpamAssassin-2.60-dist/lib/Mail/SpamAssassin/Bayes.pm Fri Jun 27 11:48:36 2003 +++ Mail-SpamAssassin-2.60/lib/Mail/SpamAssassin/Bayes.pm Mon Jun 30 16:28:19 2003 @@ -29,6 +29,10 @@ use strict; use bytes; +use Sys::Hostname; +use File::Path; +use File::Spec; +use File::Basename; use Mail::SpamAssassin; use Mail::SpamAssassin::BayesStore; @@ -295,6 +299,7 @@ } my @toks = @{$self->{tokens}}; delete $self->{tokens}; + ($wc, @toks); } @@ -606,6 +611,39 @@ ########################################################################### +sub learn_tokens { + my ($self, $toks, $id, $isspam) = @_; + + my $ret; + + eval { + local $SIG{'__DIE__'}; # do not run user die() traps in here + + my $ok; + if ($self->{main}->{learn_to_journal}) { + $ok = $self->{store}->tie_db_readonly(); + } else { + $ok = $self->{store}->tie_db_writable(); + } + + if ($ok) { + $ret = $self->learn_trapped_tokens ($toks, $id, $isspam); + + if (!$self->{main}->{learn_caller_will_untie}) { + $self->{store}->untie_db(); + } + } + }; + + if ($@) { # if we died, untie the dbs. 
+ my $failure = $@; + $self->{store}->untie_db(); + die $failure; + } + + return $ret; +} + sub learn { my ($self, $isspam, $msg) = @_; @@ -686,6 +724,49 @@ 1; } +sub learn_trapped_tokens { + my ($self, $toks, $id, $isspam) = @_; + + my $seen = $self->{store}->seen_get($id); + + if (defined($seen)) { + if (($seen eq 's' && $isspam) || ($seen eq 'h' && !$isspam)) { + dbg ("$id: already learnt correctly, not learning twice"); + return; + } elsif ($seen !~ /^[hs]$/) { + warn ("db_seen corrupt: value='$seen' for $id. ignored"); + } else { + dbg ("$id: already learnt as opposite, forgetting first"); + $self->forget_tokens ($toks, $id); + } + } + + if ($isspam) { + $self->{store}->nspam_nham_change(1,0); + } else { + $self->{store}->nspam_nham_change(0,1); + } + + my @tokens = @$toks; + my %seen = (); + + my $msgatime = time(); # this is sort of incorrect. + + for (@tokens) { + if ($seen{$_}) { next; } else { $seen{$_} = 1; } + + if ($isspam) { + $self->{store}->tok_count_change (1, 0, $_, $msgatime); + } else { + $self->{store}->tok_count_change (0, 1, $_, $msgatime); + } + } + + $self->{store}->seen_put ($id, ($isspam ? 's' : 'h')); + $self->{store}->add_touches_to_journal(); + 1; +} + ########################################################################### sub forget { @@ -719,6 +800,36 @@ return $ret; } +sub forget_tokens { + my ($self, $toks, $id) = @_; + + if (!$self->{conf}->{use_bayes}) { return; } + my $ret; + + # we still tie for writing here, since we write to the seen db + # synchronously + eval { + local $SIG{'__DIE__'}; # do not run user die() traps in here + + if ($self->{store}->tie_db_writable()) { + $ret = $self->forget_trapped_tokens ($toks, $id); + + if (!$self->{main}->{learn_caller_will_untie}) { + $self->{store}->untie_db(); + } + } + }; + + if ($@) { # if we died, untie the dbs. 
+ my $failure = $@; + $self->{store}->untie_db(); + die $failure; + } + + return $ret; +} + + # this function is trapped by the wrapper above sub forget_trapped { my ($self, $msg, $body) = @_; @@ -750,6 +861,47 @@ my %seen = (); for (@tokens) { if ($seen{$_}) { next; } else { $seen{$_} = 1; } + + if ($isspam) { + $self->{store}->tok_count_change (-1, 0, $_); + } else { + $self->{store}->tok_count_change (0, -1, $_); + } + } + + $self->{store}->seen_delete ($msgid); + 1; +} + +sub forget_trapped_tokens { + my ($self, $toks, $msgid) = @_; + + my $seen = $self->{store}->seen_get ($msgid); + my $isspam; + if (defined ($seen)) { + if ($seen eq 's') { + $isspam = 1; + } elsif ($seen eq 'h') { + $isspam = 0; + } else { + dbg ("forget: message $msgid seen entry is neither ham nor spam, ignored"); + return; + } + } else { + dbg ("forget: message $msgid not learnt, ignored"); + return; + } + + if ($isspam) { + $self->{store}->nspam_nham_change (-1, 0); + } else { + $self->{store}->nspam_nham_change (0, -1); + } + + my @tokens = @$toks; + my %seen = (); + for (@tokens) { + if ($seen{$_}) { next; } else { $seen{$_} = 1; } if ($isspam) { $self->{store}->tok_count_change (-1, 0, $_); diff -u -N -r Mail-SpamAssassin-2.60-dist/lib/Mail/SpamAssassin/BayesStore.pm Mail-SpamAssassin-2.60/lib/Mail/SpamAssassin/BayesStore.pm --- Mail-SpamAssassin-2.60-dist/lib/Mail/SpamAssassin/BayesStore.pm Fri Jun 27 11:48:36 2003 +++ Mail-SpamAssassin-2.60/lib/Mail/SpamAssassin/BayesStore.pm Fri Jun 27 14:28:55 2003 @@ -136,7 +136,6 @@ dbg ("bayes_path not defined"); return 0; } - my $path = $main->sed_path ($main->{conf}->{bayes_path}); my $found=0; diff -u -N -r Mail-SpamAssassin-2.60-dist/lib/Mail/SpamAssassin/BayesTokenLog.pm Mail-SpamAssassin-2.60/lib/Mail/SpamAssassin/BayesTokenLog.pm --- Mail-SpamAssassin-2.60-dist/lib/Mail/SpamAssassin/BayesTokenLog.pm Wed Dec 31 16:00:00 1969 +++ Mail-SpamAssassin-2.60/lib/Mail/SpamAssassin/BayesTokenLog.pm Fri Jun 27 15:02:44 2003 @@ -0,0 +1,49 @@ +=head1 
NAME + +Mail::SpamAssassin::BayesTokenLog - per message logging of Bayes Tokens for +use in training bayes. + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +=head1 METHODS + +=over 4 + +=cut + +package Mail::SpamAssassin::BayesTokenLog; + +use strict; + +use Mail::SpamAssassin; +use Mail::SpamAssassin::Conf; + +use vars qw{ + @ISA +}; + +@ISA = qw(); + + +########################################################################### + +sub new { + my $class = shift; + $class = ref($class) || $class; + my ($main) = @_; + my $self = { + 'main' => $main, + 'conf' => $main->{conf}, + }; + + bless ($self, $class); + + $self; +} + + +########################################################################### + +1; diff -u -N -r Mail-SpamAssassin-2.60-dist/lib/Mail/SpamAssassin/Conf.pm Mail-SpamAssassin-2.60/lib/Mail/SpamAssassin/Conf.pm --- Mail-SpamAssassin-2.60-dist/lib/Mail/SpamAssassin/Conf.pm Fri Jun 27 11:48:36 2003 +++ Mail-SpamAssassin-2.60/lib/Mail/SpamAssassin/Conf.pm Fri Jun 27 16:12:12 2003 @@ -2384,6 +2384,53 @@ $self->{user_scores_sql_field_scope} = $1; next; } +=item state_dir_hash_base field_value + +The base path for the state_dir_hash + +=cut + if(/^state_dir_hash_base\s+(\S+)$/) { + $self->{state_dir_hash_base} = $1; next; + } + +=item use_state_dir_hash field_value + +Set to 1 to look up the user state directory under state_dir_hash_base. + +=cut + if(/^use_state_dir_hash\s+(\S+)$/) { + $self->{use_state_dir_hash} = $1; next; + } + +=item per_msg_bayes_token_log field_value + +Set to 1 to write each scanned message's Bayes tokens to a file in per_msg_bayes_token_dir. + +=cut + if(/^per_msg_bayes_token_log\s+(\S+)$/) { + $self->{per_msg_bayes_token_log} = $1; next; + } + +=item per_msg_bayes_token_dir field_value + +set to something like __statedir__/bayes_token_log + +=cut + if(/^per_msg_bayes_token_dir\s+(\S+)$/) { + $self->{per_msg_bayes_token_dir} = $1; next; + } + +=item per_msg_bayes_token_retries field_value + +number of times to retry for the token file + +=cut + if(/^per_msg_bayes_token_retries\s+(\S+)$/) { + $self->{per_msg_bayes_token_retries} = $1; next; + } + + + 
########################################################################### failed_line: diff -u -N -r Mail-SpamAssassin-2.60-dist/lib/Mail/SpamAssassin/PerMsgStatus.pm Mail-SpamAssassin-2.60/lib/Mail/SpamAssassin/PerMsgStatus.pm --- Mail-SpamAssassin-2.60-dist/lib/Mail/SpamAssassin/PerMsgStatus.pm Fri Jun 27 11:48:36 2003 +++ Mail-SpamAssassin-2.60/lib/Mail/SpamAssassin/PerMsgStatus.pm Fri Jun 27 17:07:49 2003 @@ -112,6 +112,16 @@ $self->{head_only_hits} = 0; $self->{hits} = 0; + my $toks; + + # Log all bayesian tokens to a file for later training + if ($self->{conf}->{per_msg_bayes_token_log}) + { + $toks = join("\n", $self->{main}->{bayes_scanner}->tokenize( + $self->{msg}, $self->{main}->{bayes_scanner}->get_body_from_msg( + $self->{msg})))."\n"; + } + # Resident Mail::SpamAssassin code will possibly never change score # sets, even if bayes becomes available. So we should do a quick check # to see if we should go from {0,1} to {2,3}. We of course don't need @@ -237,6 +247,41 @@ $report =~ s/\n*$/\n\n/s; $self->{report} = $report; + if ($self->{conf}->{per_msg_bayes_token_log}) + { + my $path = $self->{main}->sed_path($self->{conf}->{per_msg_bayes_token_dir}); + + if (! -d $path) + { + dbg("Making (mkdiring) path for token log: $path"); + eval { mkpath($path, 0, 0700) } or dbg ("mkdir $path failed: $@ $!"); + } + + my $file = $self->get_maildir_file($path); + dbg("Writing token log to file: $file"); + + if (!open(F, ">$file")) + { + dbg("Couldn't open file $file for writing of tokens: $!"); + return; + } + + my $hits = $self->{hits}."\n"; + my $is_spam = $self->{is_spam} ? 
"1\n" : "0\n"; + my $from = $self->{msg}->get_pristine_header('From'); + my $date = $self->{msg}->get_pristine_header('Date'); + my $subj = $self->{msg}->get_pristine_header('Subject'); + + print F $hits; + print F $is_spam; + print F $from; + print F $date; + print F $subj; + + print F $toks; + + close(F); + } } ########################################################################### @@ -2507,6 +2552,23 @@ return ($reportfile, \*TMPFILE); } +sub get_maildir_file { + my ($self, $dir) = @_; + + my $ctime = time(); + my $pid = $$; + my $hostname = hostname(); + my $i; + + for ( $i = 0; $i < $self->{conf}->{per_msg_bayes_token_retries}; $i++ ) { + my $name = $ctime . '.' .$pid . '_' . $i . '.' . $hostname; + return ("$dir/$name") if ( ! -e "$dir/$name" ); + } + + return(''); #return undef if it hit the max. + +} + ########################################################################### 1; diff -u -N -r Mail-SpamAssassin-2.60-dist/lib/Mail/SpamAssassin/StateDirHash.pm Mail-SpamAssassin-2.60/lib/Mail/SpamAssassin/StateDirHash.pm --- Mail-SpamAssassin-2.60-dist/lib/Mail/SpamAssassin/StateDirHash.pm Wed Dec 31 16:00:00 1969 +++ Mail-SpamAssassin-2.60/lib/Mail/SpamAssassin/StateDirHash.pm Fri Jun 27 14:03:20 2003 @@ -0,0 +1,61 @@ +=head1 NAME + +Mail::SpamAssassin::StateDirHash - determine StateDir Location + +=head1 SYNOPSIS + +=head1 DESCRIPTION + +=head1 METHODS + +=over 4 + +=cut + +package Mail::SpamAssassin::StateDirHash; + +use strict; + +use Mail::SpamAssassin; +use Mail::SpamAssassin::Conf; + +use vars qw{ + @ISA +}; + +@ISA = qw(); + + +########################################################################### + +sub new { + my $class = shift; + $class = ref($class) || $class; + my ($main) = @_; + my $self = { + 'main' => $main, + 'conf' => $main->{conf}, + }; + + bless ($self, $class); + + $self; +} + +sub get_state_dir { + my ($self, $user) = @_; + + my $state_dir = ''; + + if ( $user =~ /^(.)(.)/ ) { + $state_dir = $self->{conf}->{state_dir_hash_base} . 
"/$1/$2/$user/.spamassassin"; + } + + return ($state_dir); +} + + + +########################################################################### + +1; diff -u -N -r Mail-SpamAssassin-2.60-dist/lib/Mail/SpamAssassin.pm Mail-SpamAssassin-2.60/lib/Mail/SpamAssassin.pm --- Mail-SpamAssassin-2.60-dist/lib/Mail/SpamAssassin.pm Fri Jun 27 11:48:36 2003 +++ Mail-SpamAssassin-2.60/lib/Mail/SpamAssassin.pm Fri Jun 27 14:44:36 2003 @@ -58,6 +58,7 @@ use 5.005; use Mail::SpamAssassin::Conf; +use Mail::SpamAssassin::StateDirHash; use Mail::SpamAssassin::ConfSourceSQL; use Mail::SpamAssassin::PerMsgStatus; use Mail::SpamAssassin::NoMailAudit; @@ -1263,11 +1264,21 @@ # user state directory my $fname = $self->{userstate_dir}; + + # get statedir hash from module if the config says so + if ($self->{conf}->{use_state_dir_hash}) + { + my $sdh = Mail::SpamAssassin::StateDirHash->new($self); + my $dir = $sdh->get_state_dir($self->{username}); + + unshift(@default_userstate_dir, $dir); + } + $fname ||= $self->first_existing_path (@default_userstate_dir); # If vpopmail is enabled then set fname to virtual homedir # - if (defined $self->{user_dir}) { + if (!$self->{conf}->{use_state_dir_hash} && defined $self->{user_dir}) { $fname = File::Spec->catdir ($self->{user_dir}, ".spamassassin"); } @@ -1374,7 +1385,7 @@ $path =~ s/__local_rules_dir__/$self->{LOCAL_RULES_DIR} || ''/ges; $path =~ s/__def_rules_dir__/$self->{DEF_RULES_DIR} || ''/ges; $path =~ s{__prefix__}{$self->{PREFIX} || $Config{prefix} || '/usr'}ges; - $path =~ s{__userstate__}{$self->get_and_create_userstate_dir()}ges; + $path =~ s/__userstate__/$self->get_and_create_userstate_dir()/ges; $path =~ s/^\~([^\/]*)/$self->expand_name($1)/es; return Mail::SpamAssassin::Util::untaint_file_path ($path); @@ -1383,6 +1394,7 @@ sub first_existing_path { my $self = shift; my $path; + foreach my $p (@_) { $path = $self->sed_path ($p); if (defined $path && -e $path) { return $path; } diff -u -N -r Mail-SpamAssassin-2.60-dist/rules/local.cf 
Mail-SpamAssassin-2.60/rules/local.cf --- Mail-SpamAssassin-2.60-dist/rules/local.cf Fri Jun 27 11:48:36 2003 +++ Mail-SpamAssassin-2.60/rules/local.cf Fri Jun 27 13:40:00 2003 @@ -9,3 +9,4 @@ # report_safe 1 # trusted_networks 212.17.35. +state_dir_hash_base /tmp diff -u -N -r Mail-SpamAssassin-2.60-dist/sa-btok-learn.raw Mail-SpamAssassin-2.60/sa-btok-learn.raw --- Mail-SpamAssassin-2.60-dist/sa-btok-learn.raw Wed Dec 31 16:00:00 1969 +++ Mail-SpamAssassin-2.60/sa-btok-learn.raw Mon Jun 30 16:23:19 2003 @@ -0,0 +1,94 @@ +#!/usr/bin/perl -w + +use strict; +use bytes; + +use Mail::SpamAssassin; +use Mail::SpamAssassin::Bayes; +use Mail::SpamAssassin::PerMsgLearner; + +# create the tester factory +my $spamtest = new Mail::SpamAssassin ({ + rules_filename => 0, + userprefs_filename => 0, # perhaps use later + debug => 0, + local_tests_only => 1, + dont_copy_prefs => 1, + }); + +$spamtest->init (1); + +$spamtest->init_learner({ + force_expire => 0, + learn_to_journal => 0, + wait_for_lock => 1, + caller_will_untie => 1 + }); + +# sync the journal first if we're going to go r/w so we make sure to +# learn everything before doing anything else. +# +$spamtest->rebuild_learner_caches(); + +my $processed = 0; + +while (<>) +{ + my @a = split(/\s/, $_); + &do($a[0], $a[1]); + $processed++; +} + +print "Learned from $processed messages.\n"; + +$spamtest->finish_learner(); +exit(0); + +sub do { + my ($file, $isspam) = @_; + + # message ID is the last directory piece (the filename) + my @id = split(/\//, $file); + my $id = $id[scalar(@id)-1]; + + my @toks; + + if (!open (F, $file)) + { + warn "Couldn't open file $file for reading of tokens: $!"; + } else { + + # always ignore the first 6 lines. 
+ <F>; + <F>; + <F>; + <F>; + <F>; + <F>; + + # copy out the tokens + while (<F>) + { + chomp; + push (@toks, $_); + } + + close(F); + + eval { + my $status = &learn_tokens(\@toks, $id, $isspam); + }; + + if ($@) + { + print "Couldn't learn tokens out of file $file: $@"; + } + } +} + +sub learn_tokens +{ + my ($toks, $id, $isspam) = @_; + + $spamtest->{bayes_scanner}->learn_tokens($toks, $id, $isspam); +} diff -u -N -r Mail-SpamAssassin-2.60-dist/spamd/spamd.raw Mail-SpamAssassin-2.60/spamd/spamd.raw --- Mail-SpamAssassin-2.60-dist/spamd/spamd.raw Fri Jun 27 11:48:35 2003 +++ Mail-SpamAssassin-2.60/spamd/spamd.raw Fri Jun 27 13:44:16 2003 @@ -105,6 +105,8 @@ 'vpopmail!' => \$opt{'vpopmail'}, 'v' => \$opt{'vpopmail'}, 'configpath|C=s' => \$opt{'configpath'}, + 'use-statedir-hash' => \$opt{'use_statedir_hash'}, + 'U' => \$opt{'use_statedir_hash'}, 'user-config' => \$opt{'user-config'}, 'nouser-config|x' => sub{ $opt{'user-config'} = 0 }, 'allowed-ips|A=s' => \@{$opt{'allowed-ip'}}, @@ -222,6 +224,7 @@ rules_filename => ($opt{'configpath'} || 0), local_tests_only => ($opt{'local'} || 0), debug => ($opt{'debug'} || 0), + use_statedir_hash => ($opt{'use_statedir_hash'} || 0), paranoid => ($opt{'paranoid'} || 0), home_dir_for_helpers => (defined $opt{'home_dir_for_helpers'} ? $opt{'home_dir_for_helpers'} : $orighome), PREFIX => $PREFIX,