Index: lib/Mail/SpamAssassin/Masses.pm =================================================================== --- lib/Mail/SpamAssassin/Masses.pm (revision 0) +++ lib/Mail/SpamAssassin/Masses.pm (revision 0) @@ -0,0 +1,833 @@ +# <@LICENSE> +# Copyright 2004 Apache Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +=head1 NAME + +Mail::SpamAssassin::Masses - Interface for reading and parsing rules +and mass-check logs for SpamAssassin + +=head1 SYNOPSIS + + my $parser = Mail::SpamAssassin::Masses->new(); + my $rules = $parser->readrules(); + my $logs = $parser->readlogs(); + + foreach my $test (keys %$rules) { + if ($rules->{$test}->{score} > 1) { + ... + } + +=head1 DESCRIPTION + +Mail::SpamAssassin::Masses is a module to simplify the many scripts +that used to make up the SpamAssassin re-scoring process. By +consolidating all the shared code in one module, the scripts can be +simplified and require fewer temporary files. + +=head1 METHODS + +=over 4 + +=cut + +package Mail::SpamAssassin::Masses; + +use strict; +use warnings; +use Carp; + +=item $parser = Mail::SpamAssassin::Masses->new( [ { opt => val, ... } ] ); + +Construct a new Mail::SpamAssassin::Masses object. You may pass the +following attribute-value pairs to the constructor. + +=over 4 + +=item rulesdir + +The directory containing rules. If multiple directories are desired, +an anonymous array should be passed. + +=item scoreset + +Scoreset to deal with. + +=item logfile + +Filename of mass-check log. + +=item falses + +Also count frequencies for false positives and false negatives from +the logs. + +=item falsesonly + +Only count false positives and false negatives. + +=item greprule + +Coderef that is passed a rule name and a hash ref with the entries +containing info about the rule. If the sub returns false, it is skipped. + +=item greplog + +Coderef that is passed a raw log entry. If it returns false, the entry +is skipped. + +=item sliding_window + +Use a sliding window for score ranges rather than a shrinking window. + +=item nologs + +Save memory by not saving the individual log results, just the +aggregate totals + +=back + +=cut + +sub new { + + my $class = shift; + $class = ref($class) || $class; + + my $self = shift; + if (!defined $self){ + $self = { }; + } + + $self->{scoreset} ||= 0; + $self->{rulesdir} ||= ''; + $self->{logfile} ||= "masses.log"; + + bless($self, $class); + + return $self; + +} + +=item $parser->readrules() + +Read and parse the rules from the directory specified as +C. This loads the following keys and values into the hash +entry representing the rules (see below). + +=over 4 + +=item name + +Contains the rule's name. + +=item score + +Contains the rule's score. + +=item type + +Contains the rule's type (header, body, uri, etc.) + +=item tflags + +Contains the rules tflags (nice, autolearn, etc.) as specified in the config file. + +=item lang + +Set to the value of C for language-specific tests. + +=item issubrule + +Set to true if the rules is a sub-rule, (i.e. it starts with +__). Otherwise, undefined. + +=item isnice + +This key exists and is true if the rule is nice (i.e. with a score +that can be below zero). + +=item describe + +Set to the rule's description, in English, or in the rule's language. + +=back + +There may be more values once C is run. + +=cut + + +sub readrules { + + my $self = shift; + + $self->{rules} ||= { }; + my $rules = $self->{rules}; # $rules is a reference to the anon hash + + my @dirs = ref($self->{rulesdir}) ? @{$self->{rulesdir}} : $self->{rulesdir}; + + foreach my $indir (@dirs) { + my @files = glob("$indir/*.cf"); # no reason to only do numbered files + + foreach my $file (@files) { + open (IN, "<$file") || croak("Can't open $file, $!"); + while() { + s/#.*$//g; + s/^\s+//; + s/\s+$//; + next if /^$/; + + my $lang = ''; + if (s/^lang\s+(\S+)\s+//) { + $lang = lc $1; + } + + if (/^(header|rawbody|body|full|uri|meta)\s+(\S+)\s+/) { + my $type = $1; + my $name = $2; + + $rules->{$name} ||= { }; + $rules->{$name}->{name} = $name; + $rules->{$name}->{type} = $type; + $rules->{$name}->{lang} = $lang if $lang; + $rules->{$name}->{tflags} = ''; + + if ($name =~ /^__/) { + $rules->{$name}->{issubrule} = '1'; + } + + } elsif (/^describe\s+(\S+)\s+(.+)$/) { + + # Let's get description in english, por favor -- unless the rule isn't english + + next if ($lang && (!$rules->{$1}->{lang} || $rules->{$1}->{lang} ne $lang)); + + $rules->{$1} ||= { }; + $rules->{$1}->{describe} = $2; + + } elsif (/^tflags\s+(\S+)\s+(.+)$/) { + my $name = $1; + $rules->{$name} ||= { }; + $rules->{$name}->{tflags} = $2; + if ($2 =~ /nice/) { + $rules->{$name}->{isnice} = 1; + } + } elsif (/^score\s+(\S+)\s+(.+)$/) { + my($name,$score) = ($1,$2); + $rules->{$name} ||= { }; + if ( $score =~ /\s/ ) { # there are multiple scores + ($score) = (split(/\s+/,$score))[$self->{scoreset}]; + } + $rules->{$name}->{score} = $score; + } + } + close IN; + } + } + foreach my $rule (keys %{$rules}) { + if (!defined $rules->{$rule}->{type}) { + delete $rules->{$rule}; # no rule definition -> no rule + next; + } + + if (!defined $rules->{$rule}->{score}) { + my $def = 1.0; + if ($rule =~ /^T_/) { $def = 0.01; } + + if ($rules->{$rule}->{isnice}) { + $rules->{$rule}->{score} = -$def; + } else { + $rules->{$rule}->{score} = $def; + } + } + + if ($self->{greprules} && !&{$self->{greprules}}($rule, $rules->{$rule})) + { + delete $rules->{$rule}; + next; + } + + } + + $self->{_readrules} = 1; +} + +=item $parser->readlogs() + +Read and parse logs from C. This will create the anonymous +array of hashes referred to by C<$parser->{logs}>, with the following +keys: + +=over 4 + +=item isspam + +True if the message is spam. False or undefined otherwise. + +=item isfalse + +True if the message was a false negative or positive. + +=item tests_hit + +Array reference containing references to the hash representing each +rule hit. + +=item score + +Score the message received (under current scores). + +=back + +In addition, this method adds the following keys to the rule +information in C<$parser->{rules}>. + +=over 4 + +=item freq_spam + +Frequency hit in spam. + +=item freq_ham + +Frequency hit in ham. + +=item freq_fp + +Frequency in false positives. + +=item freq_fn + +Frequency in false negatives. + +=back + +Also, sets C<$parser->{num_spam}> and C<$parser->{num_ham}> to the number of +spam logs read and the number of ham logs read, respectively. + +=cut + +sub readlogs { + + my $self = shift; + + if (!$self->{_readrules}) { + # need to read scores first! + $self->readrules(); + } + + my $rules = $self->{rules}; # copy the ref, shorthand + + my $logs; + if (! $self->{nologs}) { + $self->{logs} ||= [ ]; + $logs = $self->{logs}; + } + + + my ($num_spam, $num_ham, $count, $num_fp, $num_fn); + $num_spam = $num_ham = $count = $num_fp = $num_fn = 0; + + # First, initialize stuff + foreach my $rule (values %{$self->{rules}}) { + $rule->{freq_spam} ||= 0; + $rule->{freq_ham} ||= 0; + + if($self->{falses}) { + $rule->{freq_fp} ||= 0; + $rule->{freq_fn} ||= 0; + } + + } + + my $file = $self->{logfile}; + open (IN, "<$file"); + + while () { + next if /^\#/; + next if /^$/; + if($_ !~ /^(.)\s+(.)\s+-?[\d.]+\s+\S+(\s+\S+\s+)/) { warn "bad line: $_"; next; } + + if ($self->{greplogs} && !&{$self->{greplogs}}($_)) { + next; + } + + my $manual = $1; + my $result = $2; + $_ = $3; + s/(?:bayes|time)=\S+//; + s/,,+/,/g; + s/^\s+//; + s/\s+$//; + + + if ($manual ne $result) { + $self->{isfalse} = 1; + } + elsif ($self->{falsesonly}) { + next; + } + + if ($manual eq "s") { + $num_spam++; + $logs->[$count]->{isspam} = 1 unless $self->{nologs}; + $num_fn++ if $result eq "h"; + } else { + $num_ham++; + $num_fp++ if $result eq "s"; + } + + my @tests = (); + my $score = 0; + foreach my $tst (split (/,/, $_)) { + next if ($tst eq ''); + + # Don't count non-existant rules + # (Could happen with greprules) + next if ( !$rules->{$tst} || !$rules->{$tst}->{type} ); + + if ($manual eq "s") { + $rules->{$tst}->{freq_spam}++; + $rules->{$tst}->{freq_fn}++ if ($self->{falses} && $result eq "h"); + } + else { + $rules->{$tst}->{freq_ham}++; + $rules->{$tst}->{freq_fp}++ if ($self->{falses} && $result eq "s"); + } + + $score += $rules->{$tst}->{score}; + + push (@tests, $rules->{$tst}) unless $self->{nologs}; + } + + $logs->[$count]->{tests_hit} = \@tests unless $self->{nologs}; + $logs->[$count]->{score} = $score; + + $count++; + } + close IN; + + $self->{num_spam} = $num_spam; + $self->{num_ham} = $num_ham; + if ($self->{falses}) { + $self->{num_fn} = $num_fn; + $self->{num_fp} = $num_fp; + } + + $self->{_readlogs} = 1; # Done reading logs + +} + +=item $parser->do_statistics(); + +Calculate the S/O ratio and the rank for each test. + +This adds the following keys to the rules hashes. + +=over 4 + +=item spam_percent + +Percentage of spam messages hit. + +=item ham_percent + +Percentage of ham messages hit. + +=item soratio + +S/O ratio -- percentage of spam messages hit divided by total +percentage of messages hit. + +=back + +=cut + +sub do_statistics { + my $self = shift; + + if (! $self->{_readlogs} ) { + $self->readlogs(); + } + + my $rank_hi=0; + my $rank_lo=999999; + + foreach my $rule (values %{$self->{rules}}) { + + if (!$rule->{freq_spam}) { + $rule->{spam_percent} = 0; + } else { + $rule->{spam_percent} = $rule->{freq_spam} / $self->{num_spam} * 100.0; + } + + if (!$rule->{freq_ham}) { + $rule->{ham_percent} = 0; + } else { + $rule->{ham_percent} = $rule->{freq_ham} / $self->{num_ham} * 100.0; + } + + if (!$rule->{freq_spam} && !$rule->{freq_ham}) { + $rule->{soratio} = 0.5; + next; + } + + $rule->{soratio} = $rule->{spam_percent} / ($rule->{spam_percent} + $rule->{ham_percent}); + + } + + $self->{_statistics} = 1; + +} + +=item $parser->do_rank(); + +Calculates the ranking for each rule and stores this in the +appropriate key. + +=over 4 + +=item rank + +"Rank" of the rule. High numbers are good, low are bad. + +=back + +=cut + +sub do_rank { + + my $self = shift; + + if (! $self->{_statistics} ) { + $self->do_statistics(); + } + + my $rank_hi = 0; + my $rank_lo = 9999999; + + my %unwanted; + my %wanted; + my %wranks; + my %uranks; + my $rules = $self->{rules}; + + + foreach my $rule (values %{$self->{rules}}) { + + $wanted{$rule->{name}} = $rule->{isnice} ? $rule->{freq_ham} : $rule->{freq_spam}; + $unwanted{$rule->{name}} = $rule->{isnice} ? $rule->{freq_spam} : $rule->{freq_ham}; + + $wranks{$wanted{$rule->{name}}} = 1; + $uranks{$unwanted{$rule->{name}}} = 1; + + } + + my @wanted = sort { $wanted{$a} <=> $wanted{$b} } keys %wanted; + my @unwanted = sort { $unwanted{$b} <=> $unwanted{$a} } keys %unwanted; + + # first half of ranking is the wanted rank + my $position = 0; + my $last = undef; + + foreach my $test (@wanted) { + $position++ if defined $last && $last != $wanted{$test}; + $rules->{$test}->{rank} += $position; + $last = $wanted{$test}; + } + + # second half is the unwanted rank + $position = 0; + $last = undef; + my $normalize = (scalar keys %wranks) / (scalar keys %uranks); + + foreach my $test (@unwanted) { + $position++ if defined $last && $last != $unwanted{$test}; + $rules->{$test}->{rank} += ($position * $normalize); + $last = $unwanted{$test}; + $rank_hi = $rules->{$test}->{rank} if ($rules->{$test}->{rank} > $rank_hi); + $rank_lo = $rules->{$test}->{rank} if ($rules->{$test}->{rank} < $rank_lo); + } + + $rank_hi = $rank_hi - $rank_lo; + foreach my $rule (values %{$rules}) { + $rule->{rank} = ($rank_hi == 0) ? 0.001 : (($rule->{rank} - $rank_lo)/ $rank_hi); + } + + $self->{_rank} = 1; +} + +=item $parser->get_rules_array(); + +Returns a reference to an array of hash references. The values of +these hash have keys as listed above. + +=cut + +sub get_rules_array { + my $self = shift; + return [ values %{$self->{rules}} ]; +} + +=item $parser->get_rules_hash(); + +Returns a reference to a hash with rule names as keys and hash +references as values. The values of these hash have keys as listed +above. + +=cut + +sub get_rules_hash { + my $self = shift; + return $self->{rules}; +} + +=item $parser->get_logs(); + +Returns a reference to the array containing log entries, in the form +of anonymous hashes with keys as described above. + +=cut + +sub get_logs { + my $self = shift; + return $self->{logs}; +} + +=item $parser->get_num_ham(); + +Returns number of ham logs read. + +=cut + +sub get_num_ham { + my $self = shift; + return $self->{num_ham}; +} + +=item $parser->get_num_spam(); + +Returns number of spam logs read. + +=cut + +sub get_num_spam { + my $self = shift; + return $self->{num_spam}; +} + +=item $parser->do_score_ranges(); + +Figure out range in which score can be set based on the soratio, etc. + +This is necessary so that the perceptron doesn't set silly +scores. (This may not be as much of a problem as it was with the old +GA.) + +This adds the following keys to the rules hashes: + +=over 4 + +=item ismutable + +Determines whether the perceptron can select a score for this test. + +=item range_lo + +Determines the lowest score the perceptron can set. + +=item range_hi + +Determines the highest score the perceptron can set. + +=cut + +sub do_score_ranges() { + + my $self = shift; + + if ( !$self->{_statistics} ) { + $self->do_statistics(); + } + if ( !$self->{_rank} ) { + $self->do_rank(); + } + + foreach my $rule (values %{$self->{rules}}) { + + my ($rank, $lo, $hi); + + $rank = $rule->{rank}; + + # Get rid of rules that don't hit -- and disable completely. + if ($rule->{spam_percent} + $rule->{ham_percent} < 0.01 || + $rule->{score} == 0) { + + $rule->{ismutable} = 0; + $rule->{range_lo} = $rule->{range_hi} = 0; + next; + + } + + # next: get rid of tests that don't apply in this scoreset + # or are userconf -- set ismutable to 0, but keep the score + if ($rule->{tflags} =~ /\buserconf\b/ || + (($self->{scoreset} % 2) == 0 && $rule->{tflags} =~/\bnet\b/)) { + + $rule->{ismutable} = 0; + $rule->{range_lo} = $rule->{range_hi} = $rule->{score}; + next; + + } + + + # Normal rules: + + # This seems to convert from [-1,1] to [0,1] but we're already in + # [0,1] space - Is this right? + + # The current way ranks are calculated, > 0.5 and < 0.5 have no + # special meaning + +# # 0.0 = best nice, 1.0 = best nonnice +# if ($rule->{isnice}) { +# $rank = .5 - ($rank / 2); +# } else { +# $rank = .5 + ($rank / 2); +# } + + # ranks are really meant to be used as human readable - not as + # tools to choose score range - use soratio instead! + + if ($self->{use_sliding_window}) { + ($lo, $hi) = $self->sliding_window_ratio_to_range($rule->{soratio}); + } else { + ($lo, $hi) = $self->shrinking_window_ratio_to_range($rule->{soratio}); + } + + # Modify good rules to be lower + if ($rule->{isnice}) { + if ($rule->{tflags} =~ /\blearn\b/) { # learn rules should get + # higher scores (-5.4) + $lo *= 1.8; + } + elsif ( $rule->{soratio} <= 0.05 && $rule->{ham_percent} > 0.5) { + $lo *= 1.5; + } + + # argh, ugly... but i'm copying it whole... + $hi = ($rule->{soratio} == 0) ? $lo : + ($rule->{soratio} <= 0.005 ) ? $lo/1.1 : + ($rule->{soratio} <= 0.010 && $rule->{ham_percent} > 0.2) ? $lo/2.0 : + ($rule->{soratio} <= 0.025 && $rule->{ham_percent} > 1.5) ? $lo/10.0 : + 0; + + if ($rule->{soratio} >= 0.35 ) { + ($lo, $hi) = (0,0); + } + } + else { # Make non-nice rules have higher scores if they're good + if ($rule->{tflags} =~ /\blearn\b/ ) { + $hi *= 1.8; + } + elsif ( $rule->{soratio} >= 0.99 && $rule->{spam_percent} > 1.0) { + $hi *= 1.5; + } + + $lo = ($rule->{soratio} == 1) ? $hi: + ($rule->{soratio} >= 0.995 ) ? $hi/4.0 : + ($rule->{soratio} >= 0.990 && $rule->{spam_percent} > 1.0) ? $hi/8.0 : + ($rule->{soratio} >= 0.900 && $rule->{spam_percent} > 10.0) ? $hi/24.0 : + 0; + + if ($rule->{soratio} <= 0.65 ) { # auto-disable bad rules + ($lo, $hi) = (0,0); + } + } + + + # Some sanity checking + if($hi < $lo) { + ($lo, $hi) = ($hi, $lo); + } + + + $rule->{ismutable} = ($lo == $hi) ? 0 : 1; + $rule->{range_lo} = $lo; + $rule->{range_hi} = $hi; + + } +} + +sub sliding_window_ratio_to_range { + my ($self, $ratio) = @_; + + # (rough) graphic demo of this algorithm: + # 0.0 = -limit [......] 0 ........ limit + # 0.25 = -limit ..[..... 0 .]...... limit + # 0.5 = -limit ....[... 0 ...].... limit + # 0.75 = -limit ......[. 0 .....].. limit + # 1.0 = -limit ........ 0 [......] limit + my $sliding_window_limits = 4.8; # limits = [-$range, +$range] + my $sliding_window_size = 5.5; # scores have this range within limits + + my $lo = -$sliding_window_limits + ($sliding_window_size * $ratio); + my $hi = +$sliding_window_limits - ($sliding_window_size * (1-$ratio)); + if ($lo > $hi) { + ($lo,$hi) = ($hi,$lo); + } + ($lo, $hi); + +} +sub shrinking_window_ratio_to_range { + my ($self, $ratio) = @_; + + + # 0.0 = -limit [......] 0 ........ limit + # 0.25 = -limit ....[... 0 ]....... limit + # 0.5 = -limit ......[. 0 .]...... limit (note: tighter) + # 0.75 = -limit .......[ 0 ...].... limit + # 1.0 = -limit ........ 0 [......] limit + my $shrinking_window_lower_base = 0.00; + my $shrinking_window_lower_range = 1.00; # *ratio, added to above + my $shrinking_window_size_base = 1.00; + my $shrinking_window_size_range = 1.00; # *ratio, added to above + + my $is_nice = 0; + my $adjusted = ($ratio -.5) * 2; # adj [0,1] to [-1,1] + if ($adjusted < 0) { $is_nice = 1; $adjusted = -$adjusted; } + +#$adjusted /= 1.5 if ( $ratio < 0.95 && $ratio > 0.15 ); # tvd + + my $lower = $shrinking_window_lower_base + + ($shrinking_window_lower_range * $adjusted); + my $range = $shrinking_window_size_base + + ($shrinking_window_size_range * $adjusted); + my $lo = $lower; + my $hi = $lower + $range; + if ($is_nice) { + my $tmp = $hi; $hi = -$lo; $lo = -$tmp; + } + if ($lo > $hi) { + ($lo,$hi) = ($hi,$lo); + } + + ($lo, $hi); +} + + +# Pacify perl +1; Index: hit-frequencies =================================================================== --- hit-frequencies (revision 10703) +++ hit-frequencies (working copy) @@ -16,385 +16,252 @@ # limitations under the License. # + use FindBin; -use Getopt::Std; -getopts("fm:M:X:l:L:pxhc:at:s:i"); +use lib "$FindBin::Bin/../lib"; +use Mail::SpamAssassin::Masses; +use Getopt::Long qw(:config bundling auto_help); +use Pod::Usage; +use strict; +use warnings; + use vars qw { $opt_f $opt_m $opt_M $opt_X $opt_p $opt_x $opt_h $opt_l $opt_L $opt_c - $opt_a $opt_t $opt_s $opt_i $sorting + $opt_a $opt_t $opt_s $opt_z $opt_inclang $opt_auto }; -sub usage { - die "hit-frequencies [-c rules dir] [-f] [-m RE] [-M RE] [-X RE] [-l LC] - [-s SC] [-a] [-p] [-x] [-i] [spam log] [ham log] +GetOptions("c|cffile=s@" => \$opt_c, + "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode) + "l|logfile=s" => \$opt_l, + "f|falses" => \$opt_f, + "a|all" => \$opt_a, + "p|percentages" => \$opt_p, + "x|extended" => \$opt_x, + "m|matchrule=s" => \$opt_m, #, + "t|tflags=s" => \$opt_t, + "M|matchlog=s" => \$opt_M, + "X|excludelog=s" => \$opt_X, + "L|language=s" => \$opt_L, + "include-language=s" => \$opt_inclang); - -c p use p as the rules directory - -f falses. count only false-negative or false-positive matches - -m RE print rules matching regular expression - -t RE print rules with tflags matching regular expression - -M RE only consider log entries matching regular expression - -X RE don't consider log entries matching regular expression - -l LC also print language specific rules for lang code LC (or 'all') - -L LC only print language specific rules for lang code LC (or 'all') - -a display all tests - -p percentages. implies -x - -x extended output, with S/O ratio and scores - -s SC which scoreset to use - -i use IG (information gain) for ranking - options -l and -L are mutually exclusive. +=head1 NAME - options -M and -X are *not* mutually exclusive. +hit-frequencies - Display statistics about tests hit by a mass-check run - if either the spam or and ham logs are unspecified, the defaults - are \"spam.log\" and \"ham.log\" in the cwd. +=head1 SYNOPSIS -"; -} +hit-frequencies [options] -usage() if($opt_h || ($opt_l && $opt_L)); + Options: + -c,--cffile=path Use path as the rules directory + -s,--scoreset=n Use scoreset n + -l,--logfile=file Read in file instead of masses.log + -f Count only false-positives/false-negatives + -a Report all tests (including subrules) + -p Report percentages instead of raw hits + -x "Extended" output, include RANK, S/O and SCORE + -m,--matchrule=re Print rules matching the regular expression + -t,--tflags=re Print only rules with tflags matching the regular expression + -M,--matchlog=re Consider only logs matching the regular expression + -X,--excludelog=re Exclude logs matching this regular expression + -L,--language=lc Only print language specific tests for specified lang code (try 'all') + --include-language=lc Also print language specific tests for specified lang code (try 'all') -if ($opt_p) { - $opt_x = 1; -} +=head1 DESCRIPTION -$opt_s = 0 if ( !defined $opt_s ); +B will read the mass-check log F or the +log given by the B<--logfile> option. The output will contain a +summary of the number of ham and spam messages and detailed statistics +for each rule. By default, B will try to guess the +proper values for B<--cffile> based on the header of the +masses.log. The output will include the following columns: -my $cffile = $opt_c || "$FindBin::Bin/../rules"; +=over 4 -my %freq_spam = (); -my %freq_ham = (); -my $num_spam = 0; -my $num_ham = 0; -my %ranking = (); -my $ok_lang = ''; +=item OVERALL -readscores($cffile); +Number of times (or percentage with B<-p>) the rule hit on +all messages (spam or ham). -$ok_lang = lc ($opt_l || $opt_L || ''); -if ($ok_lang eq 'all') { $ok_lang = '.'; } +=item SPAM -foreach my $key (keys %rules) { +Number of times (or percentage with B<-p>) the rule hit on +spam messages. - if ( ($opt_L && !$rules{$key}->{lang}) || - ($rules{$key}->{lang} && - (!$ok_lang || $rules{$key}->{lang} !~ /^$ok_lang/i) - ) ) { - delete $rules{$key} ; next; - } +=item HAM - $freq_spam{$key} = 0; - $freq_ham{$key} = 0; -} +Number of times (or percentage with B<-p>) the rule hit on +ham messages. -readlogs(); +=item S/O -my $hdr_all = $num_spam + $num_ham; -my $hdr_spam = $num_spam; -my $hdr_ham = $num_ham; +Shown only with B<-x> or B<-p>, this is the number of spam hits +divided by total number of hits (C refers to spam divided by +overall). -if ($opt_p) { - my $sorting = $opt_i ? "IG" : "RANK"; - if ($opt_f) { - printf "%7s %7s %7s %6s %6s %6s %s\n", - "OVERALL%", "FNEG%", "FPOS%", "S/O", $sorting, "SCORE", "NAME"; - } else { - printf "%7s %7s %7s %6s %6s %6s %s\n", - "OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME"; - } - printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n", - $hdr_all, $hdr_spam, $hdr_ham, - soratio ($num_spam,$num_ham), 0, 0; +=item RANK - $hdr_spam = ($num_spam / $hdr_all) * 100.0; - $hdr_ham = ($num_ham / $hdr_all) * 100.0; - $hdr_all = 100.0; # this is obvious - printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f (all messages as %%)\n", - $hdr_all, $hdr_spam, $hdr_ham, - soratio ($num_spam,$num_ham), 0, 0; +Shown only with B<-x> or B<-p>, this is a measure that attempts to +indicate how I or I a test is. The higher it is, the +better the test. -} elsif ($opt_x) { - printf "%7s %7s %7s %6s %6s %6s %s\n", - "OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME"; - printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n", - $hdr_all, $hdr_spam, $hdr_ham, - soratio ($num_spam,$num_ham), 0, 0; +=item SCORE -} else { - printf "%10s %10s %10s %s\n", - "OVERALL", "SPAM", "HAM", "NAME"; - printf "%10d %10d %10d (all messages)\n", - $hdr_all, $hdr_spam, $hdr_ham; -} +Shown only with B<-x> or B<-p>, this is the current score assigned to +the rule. -my %done = (); -my @tests = (); -my $rank_hi = 0; -my $rank_lo = 9999999; +=item NAME -# variables for wanted/unwanted RANK -my %wanted; -my %unwanted; -my %wranks; -my %uranks; +This is the rule's name. -foreach my $test (keys %freq_spam, keys %freq_ham) { - next unless (exists $rules{$test}); # only valid tests - next if (!$opt_a && $rules{$test}->{issubrule}); +=back - next if $done{$test}; $done{$test} = 1; - push (@tests, $test); +=head1 BUGS - my $isnice = 0; - if ($rules{$test}->{tflags} =~ /nice/) { $isnice = 1; } +Please report bugs to http://bugzilla.spamassassin.org/ - my $fs = $freq_spam{$test}; $fs ||= 0; - my $fn = $freq_ham{$test}; $fn ||= 0; - my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0; - my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0; +=head1 SEE ALSO - my $soratio = $soratio{$test} = soratio ($fsadj, $fnadj); +L, L, L - if ($isnice) { - $soratio = 1.0 - $soratio; - my $tmp = $fsadj; $fsadj = $fnadj; $fnadj = $tmp; - } +=cut - if ($opt_i) { - # come up with a ranking - my $rank; - - # New new system: from "Learning to Filter Unsolicited Commercial E-Mail", - # Ion Androutsopoulos et al: determine the information gain IG(X, C) of the - # Boolean attributes (ie. the rules). Measures "the average reduction in - # the entropy of C (classification) given the value of X (the rule)". Makes - # a good ranking measure with a proper statistical basis. ;) - # - # Still would like to get an entropy measure in, too. - # - # sum P(X = x ^ C = c) - # IG(X,C) = x in [0, 1] P(X = x ^ C = c) . log2( ------------------- ) - # c in [Ch, Cs] P(X = x) . P(C = c) - # - my $safe_nspam = $num_spam || 0.0000001; - my $safe_nham = $num_ham || 0.0000001; - - my $num_all = ($num_spam + $num_ham); - my $safe_all = $num_all || 0.0000001; - my $f_all = $fs+$fn; - - my $px0 = (($num_all - $f_all) / $safe_all); # P(X = 0) - my $px1 = ($f_all / $safe_all); # P(X = 1) - my $pccs = ($num_spam / $safe_all); # P(C = Cs) - my $pcch = ($num_ham / $safe_all); # P(C = Ch) - my $px1ccs = ($fs / $safe_nspam); # P(X = 1 ^ C = Cs) - my $px1cch = ($fn / $safe_nham); # P(X = 1 ^ C = Ch) - my $px0ccs = (($num_spam - $fs) / $safe_nspam); # P(X = 0 ^ C = Cs) - my $px0cch = (($num_ham - $fn) / $safe_nham); # P(X = 0 ^ C = Ch) - my $safe_px0_dot_pccs = ($px0 * $pccs) || 0.00000001; - my $safe_px0_dot_pcch = ($px0 * $pcch) || 0.00000001; - my $safe_px1_dot_pccs = ($px1 * $pccs) || 0.00000001; - my $safe_px1_dot_pcch = ($px1 * $pcch) || 0.00000001; - - sub log2 { return log($_[0]) / 0.693147180559945; } # log(2) = 0.6931... - - my $safe_px0ccs = ($px0ccs || 0.0000001); - my $safe_px0cch = ($px0cch || 0.0000001); - my $safe_px1ccs = ($px1ccs || 0.0000001); - my $safe_px1cch = ($px1cch || 0.0000001); - $rank = ( $px0ccs * log2($safe_px0ccs / $safe_px0_dot_pccs) ) + - ( $px0cch * log2($safe_px0cch / $safe_px0_dot_pcch) ) + - ( $px1ccs * log2($safe_px1ccs / $safe_px1_dot_pccs) ) + - ( $px1cch * log2($safe_px1cch / $safe_px1_dot_pcch) ); - - $ranking{$test} = $rank; - $rank_hi = $rank if ($rank > $rank_hi); - $rank_lo = $rank if ($rank < $rank_lo); - } - else { - # basic wanted/unwanted ranking - $wanted{$test} = $isnice ? $fn : $fs; - $unwanted{$test} = $isnice ? $fs : $fn; - # count number of ranks of each type - $wranks{$wanted{$test}} = 1; - $uranks{$unwanted{$test}} = 1; - } +if ($opt_L && $opt_inclang) { + pod2usage("-L/--language and --include-language are mutually exclusive"); } -# finish basic wanted/unwanted ranking -if (! $opt_i) { - my @wanted = sort { $wanted{$a} <=> $wanted{$b} } keys %wanted; - my @unwanted = sort { $unwanted{$b} <=> $unwanted{$a} } keys %unwanted; - - # first half of ranking is the wanted rank - my $position = 0; - my $last = undef; - for my $test (@wanted) { - $position++ if defined $last && $last != $wanted{$test}; - $ranking{$test} += $position; - $last = $wanted{$test} - } - - # second half of ranking is the unwanted rank - my $normalize = (scalar keys %wranks) / (scalar keys %uranks); - $position = 0; - $last = undef; - for my $test (@unwanted) { - $position++ if defined $last && $last != $unwanted{$test}; - $ranking{$test} += ($position * $normalize); - $last = $unwanted{$test}; - $rank_hi = $ranking{$test} if ($ranking{$test} > $rank_hi); - $rank_lo = $ranking{$test} if ($ranking{$test} < $rank_lo); - } +if ($opt_p) { + $opt_x = 1; } -{ - # now normalise the rankings to [0, 1] - $rank_hi -= $rank_lo; - foreach $test (@tests) { - $ranking{$test} = $rank_hi == 0 ? 0.001 : ($ranking{$test} - $rank_lo) / ($rank_hi); - } -} +$opt_s = 0 if ( !defined $opt_s ); -foreach $test (sort { $ranking{$b} <=> $ranking{$a} } @tests) { - next unless (exists $rules{$test}); # only valid tests - next if (!$opt_a && $rules{$test}->{issubrule}); +my $ok_lang = lc ( $opt_inclang || $opt_L || ''); +$ok_lang = '.' if ($ok_lang eq 'all'); - my $fs = $freq_spam{$test}; $fs ||= 0; - my $fn = $freq_ham{$test}; $fn ||= 0; - my $fa = $fs+$fn; +my $greprules = sub { # To determine whether rule should be read + my ($name, $rule) = @_; - next if ($opt_m && $test !~ m/$opt_m/); # match certain tests - next if ($opt_t && $rules{$test}->{tflags} !~ /$opt_t/); # match tflags + return 0 if ($opt_m && $name !~ /$opt_m/); # name doesn't match -m + # expression + return 0 if ($opt_t && $rule->{tflags} !~ /$opt_t/); # tflags don't + # match -t + # expression + return 0 if (($opt_L && !$rule->{lang}) || + ($rule->{lang} && + (!$ok_lang || $rule->{lang} !~ /^$ok_lang/i))); # Wrong language + return 0 if ($rule->{issubrule} && !$opt_a); + if (!$opt_a && !$opt_t) { - next if ($rules{$test}->{tflags} =~ /net/ && ($opt_s % 2 == 0)); # not net tests - next if ($rules{$test}->{tflags} =~ /userconf/); # or userconf + return 0 if ($rule->{tflags} =~ /net/ && ($opt_s % 2 == 0)); + return 0 if ($rule->{tflags} =~ /userconf/); # or userconf } + return 1; - # adjust based on corpora sizes (and cvt to % while we're at it) - my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0; - my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0; +}; - if ($opt_f && $fsadj == 0 && $fnadj == 0) { next; } - if ($opt_p) { - $fa = ($fa / ($num_spam + $num_ham)) * 100.0; - $fs = $fsadj; - $fn = $fnadj; - } +my $logfile = $opt_l || "masses.log"; - my $soratio = $soratio{$test}; - if (!defined $soratio) { - $soratio{$test} = soratio ($fsadj, $fnadj); - } +if (!$opt_c || !scalar(@$opt_c)) { + # Try to read this in from the log, if possible + open IN, $logfile or die "Can't open $logfile: $!"; + my $files = 0; # are we in the files section? + while() { + if (!$files) { + if (/^\# SVN revision:/) { + $opt_c = [ "$FindBin::Bin/../rules" ]; + last; + } elsif (/^\# Using configuration:$/) { + $files = 1; + } + } elsif (/^\#\s+(.*)\s*$/) { + push (@$opt_c, $1); + } else { + # All done! + last; + } + } - if ($opt_p) { - printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f %s\n", - $fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}, $test; - - } elsif ($opt_x) { - printf "%7d %7d %7d %7.3f %6.2f %6.2f %s\n", - $fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}, $test; - - } else { - printf "%10d %10d %10d %s\n", $fa, $fs, $fn, $test; - } + foreach my $file (@$opt_c) { + die "Can't read $file" unless -r $file; + } } -exit; + +my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c, + scoreset => $opt_s, + falsesonly => $opt_f, + greprules => $greprules, + logfile => $logfile, + nologs => 1}); +$masses->readrules(); +$masses->readlogs(); +$masses->do_statistics(); +$masses->do_rank(); +my $rules = $masses->get_rules_hash(); +my $num_ham = $masses->get_num_ham(); +my $num_spam = $masses->get_num_spam(); +my $num_all = $num_ham + $num_spam; -sub readlogs { - my $spam = $ARGV[0] || "spam.log"; - my $ham = $ARGV[1] || (-f "good.log" ? "good.log" : "ham.log"); +if ($num_ham + $num_spam <= 0) { + die "Can't run hit-frequencies on 0 messages."; +} - foreach my $file ($spam, $ham) { - open (IN, "<$file") || die "Could not open file '$file': $!"; +## Write header - my $isspam = 0; ($file eq $spam) and $isspam = 1; +if ($opt_p) { - while () { - next if (/^#/); - next unless (!$opt_M || /$opt_M/o); - next if ($opt_X && /$opt_X/o); + if ($opt_f) { + printf "%7s %7s %7s %6s %6s %6s %s\n", + "OVERALL%", "FNEG%", "FPOS%", "S/O", "RANK", "SCORE", "NAME"; + } else { + printf "%7s %7s %7s %6s %6s %6s %s\n", + "OVERALL%", "SPAM%", "HAM%", "S/O", "RANK", "SCORE", "NAME"; + } - /^(.)\s+(-?\d+)\s+(\S+)\s*(\S*)/ or next; - my $caught = ($1 eq 'Y'); - my $hits = $2; - $_ = $4; s/,,+/,/g; + printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n", + $num_all, $num_spam, $num_ham, + $num_spam / $num_all, 0, 0; - if ($isspam) { - if ($opt_f) { - if (!$caught) { $num_spam++; } - } else { - $num_spam++; - } - } else { - if ($opt_f) { - if ($caught) { $num_ham++; } - } else { - $num_ham++; - } - } + printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f (all messages as %%)\n", + 100.0, $num_spam / $num_all * 100.0, $num_ham / $num_all * 100.0, + $num_spam / $num_all, 0, 0; - my @tests = split (/,/, $_); - foreach my $t (@tests) { - next if ($t eq ''); - if ($isspam) { - if ($opt_f) { - if (!$caught) { $freq_spam{$t}++; } - } else { - $freq_spam{$t}++; - } - } else { - if ($opt_f) { - if ($caught) { $freq_ham{$t}++; } - } else { - $freq_ham{$t}++; - } - } - } - } - close IN; - } -} +} elsif ($opt_x) { + printf "%7s %7s %7s %6s %6s %6s %s\n", + "OVERALL", "SPAM", "HAM", "S/O", "RANK", "SCORE", "NAME"; + printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n", + $num_all, $num_spam, $num_ham, + $num_spam / $num_all, 0, 0; - -sub readscores { - my($cffile) = @_; - system ("$FindBin::Bin/parse-rules-for-masses -d \"$cffile\" -s $opt_s") and die; - require "./tmp/rules.pl"; +} else { + printf "%10s %10s %10s %s\n", + "OVERALL", "SPAM", "HAM", "NAME"; + printf "%10d %10d %10d (all messages)\n", + $num_all, $num_spam, $num_ham; } -sub soratio { - my ($s, $n) = @_; +foreach my $test (sort { $rules->{$b}->{rank} <=> $rules->{$a}->{rank} } keys %{$rules}) { - $s ||= 0; - $n ||= 0; - - if ($s + $n > 0) { - return $s / ($s + $n); + if ($opt_p) { + printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f %s\n", + ($rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham}) / $num_all * 100.0, + $rules->{$test}->{spam_percent}, $rules->{$test}->{ham_percent}, + $rules->{$test}->{soratio}, $rules->{$test}->{rank}, $rules->{$test}->{score}, $test; + } elsif ($opt_x) { + printf "%7d %7d %7d %7.3f %6.2f %6.2f %s\n", + $rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham}, + $rules->{$test}->{freq_spam}, $rules->{$test}->{freq_ham}, + $rules->{$test}->{soratio}, $rules->{$test}->{rank}, $rules->{$test}->{score}, $test; } else { - return 0.5; # no results -> not effective + printf "%10d %10d %10d %s\n", + $rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham}, + $rules->{$test}->{freq_spam}, $rules->{$test}->{freq_ham}, $test; } } -sub tcr { - my ($nspam, $nlegit, $nspamspam, $nlegitspam) = @_; - my $nspamlegit = $nspam - $nspamspam; - my $nlegitlegit = $nlegit - $nlegitspam; - - my $lambda = 99; - - my $werr = ($lambda * $nlegitspam + $nspamlegit) - / ($lambda * $nlegit + $nspam); - - my $werr_base = $nspam - / ($lambda * $nlegit + $nspam); - - $werr ||= 0.000001; # avoid / by 0 - my $tcr = $werr_base / $werr; - return $tcr; -} Index: rewrite-cf-with-new-scores =================================================================== --- rewrite-cf-with-new-scores (revision 10703) +++ rewrite-cf-with-new-scores (working copy) @@ -16,32 +16,119 @@ # limitations under the License. # +=head1 NAME + +rewrite-cf-with-new-scores - Rewrite SpamAssassin scores file with new +scores. + +=head1 SYNOPSIS + +rewrite-cf-with-new-scores [options] + + Options + --old-scores=file Read file containing the old SpamAssassin scores + --new-scores=file Read file containing the new SpamAssassin scores + -s,--scoreset n Rewrite scoreset n + --output=file Output rewritten score file to file + -c,--cffile=path Use path as the rules directory + -l,--logfile=file Use file instead of masses.log (for guessing -c) + + Note: these options can be shortened (i.e. --old, --new, --out) as + long as they are unambiguous. + +=head1 DESCRIPTION + +B is a tool to update the sitewide scores +file with the newly generated scores. Since SpamAssassin has four +different scoresets, which each need to be generated separately, this +tool is used to only change the correct scoreset. + +By default, the old scores are read from 50_scores.cf in the rules +directory and the new ones from ./perceptron.scores. The output will +be ./50_scores.cf by default. + +The rules directory needs to be used to make sure scores are given for +the right tests. Rules not found in the rules directory will not be +given scores in the output. + +=head1 BUGS + +Please report bugs to http://bugzilla.spamassassin.org/ + +=head1 SEE ALSO + +L, L, L + +=cut + +use FindBin; +use lib "$FindBin::Bin/../lib"; +use Getopt::Long qw(:config bundling auto_help); +use Mail::SpamAssassin::Masses; +use Pod::Usage; +use strict; +use warnings; + +use vars qw($opt_old $opt_new $opt_scoreset $opt_out $opt_c $opt_l); + +GetOptions("old-scores=s" => \$opt_old, + "new-scores=s" => \$opt_new, + "s|scoreset=i" => \$opt_scoreset, + "output=s" => \$opt_out, + "c|cffile=s@" => \$opt_c, + "l|logfile=s" => \$opt_l); + +$opt_l ||= "masses.log"; +$opt_scoreset = 0 unless defined $opt_scoreset; + my $NUM_SCORESETS = 4; -my ($scoreset,$oldscores,$newscores) = @ARGV; +if (!$opt_c || !scalar(@$opt_c)) { + # Try to read this in from the log, if possible + open IN, $opt_l or die "Can't open $opt_l: $!"; + my $files = 0; # are we in the files section? + while() { + if (!$files) { + if (/^\# SVN revision:/) { + $opt_c = [ "$FindBin::Bin/../rules" ]; + last; + } elsif (/^\# Using configuration:$/) { + $files = 1; + } + } elsif (/^\#\s+(.*)\s*$/) { + push (@$opt_c, $1); + } else { + # All done! + last; + } + } -$scoreset = int($scoreset) if defined $scoreset; -if (!defined $newscores || $scoreset < 0 || $scoreset >= $NUM_SCORESETS ) { - die "usage: rewrite-cf-with-new-scores scoreset oldscores.cf newscores.cf\n"; + foreach my $file (@$opt_c) { + die "Can't read $file" unless -r $file; + } } -system ("./parse-rules-for-masses -s $scoreset") and die; -if (-e "tmp/rules.pl") { - # Note, the spaces need to stay in front of the require to work around a RPM 4.1 problem - require "./tmp/rules.pl"; +if (!$opt_old) { + $opt_old = $$opt_c[0] . "/50_scores.cf"; } -else { - die "parse-rules-for-masses had no error but no tmp/rules.pl!?!"; -} +$opt_new ||= "50_scores.cf"; + +my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c, + scoreset => $opt_scoreset}); + +$masses->readrules(); +my $rules = $masses->get_rules_hash(); + # now read the generated scores my @gascoreorder = (); +my %oldscores = (); my %gascorelines = (); -open (STDIN, "<$newscores") or die "cannot open $newscores"; +open (STDIN, "<$opt_new") or die "cannot open $opt_new"; while () { /^score\s+(\S+)\s+(-?\d+(?:\.\d+)?)/ or next; my $name = $1; my $score = $2; - next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0); + next unless (exists ($rules->{$name}) && !$rules->{$name}->{issubrule}); next if ($name =~ /^__/); next if ($name eq '(null)'); # er, oops ;) @@ -49,7 +136,7 @@ push (@gascoreorder, $name); } -open (IN, "<$oldscores") or die "cannot open $oldscores"; +open (IN, "<$opt_old") or die "cannot open $opt_old"; my $out = ''; my $pre = ''; @@ -58,7 +145,7 @@ while () { if (/^\s*score\s+(\S+)\s/) { delete $gascorelines{$1}; - next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0); + next unless (exists ($rules->{$1}) && $rules->{$1}->{issubrule} == 0); } $pre .= $_; /^# Start of generated scores/ and last; @@ -82,10 +169,10 @@ if (/^\s*score\s+\S+/) { my($score,$name,@scores) = split; - next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0); + next unless (exists ($rules->{$name}) && !$rules->{$name}->{issubrule}); if (defined $gascorelines{$name}) { # Set appropriate scoreset value - $scores[$scoreset] = $gascorelines{$name}; + $scores[$opt_scoreset] = $gascorelines{$name}; # Create new score line $_ = join(" ","score",$name,generate_scores(@scores))."\n"; @@ -96,8 +183,10 @@ } close IN; +open OUT, ">$opt_out" or die "Can't open $opt_out: $!"; + # and output the lot -print $pre, "\n"; +print OUT $pre, "\n"; foreach my $name (@gascoreorder) { $_ = $gascorelines{$name}; next unless (defined ($_)); @@ -107,12 +196,12 @@ @scores = @{$oldscores{$name}} if ( exists $oldscores{$name} ); # Set appropriate scoreset value - $scores[$scoreset] = $_; + $scores[$opt_scoreset] = $_; # Create new score line - print join(" ","score",$name,generate_scores(@scores)),"\n"; + print OUT join(" ","score",$name,generate_scores(@scores)),"\n"; } -print "\n", $out, "\n"; +print OUT "\n", $out, "\n"; sub generate_scores { my (@scores) = @_; Index: Makefile =================================================================== --- Makefile (revision 10703) +++ Makefile (working copy) @@ -4,33 +4,26 @@ # What rule scoreset are we using? SCORESET = 0 +LOGFILE = masses.log #### Should be no need to modify below this line all: badrules perceptron perceptron: perceptron.o - $(CC) -o perceptron perceptron.o $(LDFLAGS) + $(CC) -o perceptron perceptron.o $(LDFLAGS) -perceptron.o: tmp/rules.pl tmp/tests.h tmp/scores.h +perceptron.o: tmp/tests.h $(CC) $(CFLAGS) -c -o perceptron.o perceptron.c -tmp/rules.pl: tmp/.created parse-rules-for-masses - perl parse-rules-for-masses -d ../rules -s $(SCORESET) +tmp/tests.h: tmp/.created logs-to-c + perl logs-to-c --scoreset=$(SCORESET) --logfile=$(LOGFILE) -tmp/tests.h: tmp/.created tmp/ranges.data logs-to-c - perl logs-to-c --scoreset=$(SCORESET) +freqs: masses.log + perl hit-frequencies -x -p -s $(SCORESET) --logfile=$(LOGFILE) > freqs -tmp/scores.h: tmp/tests.h - -tmp/ranges.data: tmp/.created freqs score-ranges-from-freqs - perl score-ranges-from-freqs ../rules $(SCORESET) < freqs - -freqs: spam.log ham.log - perl hit-frequencies -x -p -s $(SCORESET) > freqs - badrules: freqs - perl lint-rules-from-freqs < freqs > badrules + perl lint-rules-from-freqs -s $(SCORESET) --logfile=$(LOGFILE) > badrules tmp/.created: -mkdir tmp Index: lint-rules-from-freqs =================================================================== --- lint-rules-from-freqs (revision 10703) +++ lint-rules-from-freqs (working copy) @@ -16,124 +16,226 @@ # limitations under the License. # +=head1 NAME + +lint-rules-from-freqs - Try to find problems with SpamAssassin rules + +=head1 SYNOPSIS + +lint-rules-from-freqs [options] + + Options: + -c,--cffile=path Use path as the rules directory + -s,--scoreset=n Use scoreset n + -l,--logfile=file Read in file instead of masses.log + -f Also take into account false positives/negatives + +=head1 DESCRIPTION + +This script analyzes SpamAssassin tests, based on the hit frequencies +and S/O ratios from a mass-check log (masses.log). This script can +also optionally take into account the false positive/negative +frequencies. + +The script first uses the SpamAssassin rules parser to report on any +illegal syntax. Then it checks the rules match frequencies from the +mass-check log in order to determine how effective the rule is. + +=head1 BUGS + +Please report bugs to http://bugzilla.spamassassin.org/ + +=head1 SEE ALSO + +L, L, L + +=cut + + +use FindBin; +use lib "$FindBin::Bin/../lib"; +use Mail::SpamAssassin::Masses; +use Mail::SpamAssassin; +use Getopt::Long qw(:config bundling auto_help); +use strict; +use warnings; + # any tests that get less than this % of matches on *both* spam or nonspam, are # reported. my $LOW_MATCHES_PERCENT = 0.03; -my $scoreset = 0; +use vars qw($opt_c $opt_l $opt_s $opt_f $opt_p); + +GetOptions("c|cffile=s@" => \$opt_c, + "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode) + "l|logfile=s" => \$opt_l, + "f|falses" => \$opt_f); + + +$opt_s = 0 unless defined $opt_s; +$opt_l ||= "masses.log"; + sub usage { die " lint-rules-from-freqs: perform 'lint' testing on SpamAssassin rules and scores -usage: ./lint-rules-from-freqs [-f falsefreqs] < freqs > badtests +usage: ./lint-rules-from-freqs [-c rules dir] [-l logfile] [-s scoreset] [-f] -This analyzes SpamAssassin tests, based on the hit frequencies and S/O ratios -from a mass-check logfile pair. + -c p Use p as rules directory + -l f Use f as log fine + -s n Use n as score set + -f Check also for false positives/negatives -The 'freqs' argument is the frequency of hits in all messages ('hit-frequencies --x -p' output). -The 'falsefreqs' argument is frequencies of hits in false-positives and -false-negatives only ('hit-frequencies -x -p -f' output). - "; } -my $opt_falsefreqs; -while ($#ARGV >= 0) { - $_ = shift @ARGV; - if (/^-f/) { $_ = shift @ARGV; $opt_falsefreqs = $_; } - elsif (/^-s/) { $_ = shift @ARGV; $scoreset = $_; } - else { usage(); } + +if (!$opt_c || !scalar(@$opt_c)) { + # Try to read this in from the log, if possible + open IN, $opt_l or die "Can't open $opt_l: $!"; + my $files = 0; # are we in the files section? + while() { + if (!$files) { + if (/^\# SVN revision:/) { + $opt_c = [ "$FindBin::Bin/../rules" ]; + last; + } elsif (/^\# Using configuration:$/) { + $files = 1; + } + } elsif (/^\#\s+(.*)\s*$/) { + push (@$opt_c, $1); + } else { + # All done! + last; + } + } + + foreach my $file (@$opt_c) { + die "Can't read $file" unless -r $file; + } } print "BAD TESTS REPORT\n"; -readrules(); -print "\n" .((scalar keys %rulefile) + 1). " rules found.\n"; +# First, do a --lint + print "\nRule file syntax issues:\n\n"; -lintrules(); -if ($opt_falsefreqs) { - open (FALSE, "<$opt_falsefreqs"); - while () { - if (!/^\s*([\d\.]+)/) { - my ($overall, $spam, $nons, $so, $score, $name) = split (' '); - next unless ($name =~ /\S/); - $falsefreqs_spam{$name} = $spam; - $falsefreqs_nons{$name} = $nons; - $falsefreqs_so{$name} = $so; +{ + local (*STDERR) = \*STDOUT; # Get lint errors on STDOUT + + # Read the config ourselves... + + # Read init.pre from each directory, then glob for the rest. + + my $cf_txt = ''; + my @files; + my @dirs; + foreach my $file (@$opt_c) { + if (-d $file) { + if (-r "$file/init.pre") { + push @files, "$file/init.pre"; + } + push @dirs, $file; } + else { + push @files, $file; + } } - close FALSE; -} + foreach my $dir (@dirs) { + my @cfs = glob("$dir/*.cf"); + push @files, grep { -r $_ } @cfs; + } -while (<>) { - if (!/^\s*([\d\.]+)/) { - $output{'a_header'} = $_; next; + foreach my $file (@files) { + if (-r $file) { + open IN, $file; + $cf_txt .= "file start $file\n"; + $cf_txt .= join('', ); + $cf_txt .= "\nfile end $file\n"; + close IN; + } } + my $spamtest = new Mail::SpamAssassin({config_text => $cf_txt}); + + $spamtest->lint_rules(); +} + + +# Next, check for other stuff +my $masses = Mail::SpamAssassin::Masses->new({rulesdir => $opt_c, + scoreset => $opt_s, #,, + falses => $opt_f, + logfile => $opt_l}); + +$masses->readlogs(); +$masses->do_statistics(); + +my $rules = $masses->get_rules_array(); + + +my %output; + +foreach my $rule (@$rules) { + my $badrule; - my ($overall, $spam, $nons, $so, $score, $name) = split (' '); - next unless ($name =~ /\S/); - my $ffspam = $falsefreqs_spam{$name}; - my $ffnons = $falsefreqs_nons{$name}; - my $ffso = $falsefreqs_so{$name}; + next if ($rule->{tflags} =~ /\bnet\b/ && ($opt_s % 2) == 0); + next if ($rule->{tflags} =~ /\buserconf\b/); - my $tf = $tflags{$name}; - next if ($tf =~ /net/ && ($scoreset % 2) == 0); - next if ($tf =~ /userconf/); + if ($rule->{freq_spam} == 0 && $rule->{freq_ham} == 0) { # sanity! - if ($overall == 0.0 && $spam == 0.0 && $nons == 0.0) { # sanity! $badrule = 'no matches'; } else { - if ($score < 0.0) { + if ($rule->{score} < 0.0) { # negative score with more spams than nonspams? bad rule. - if ($tf !~ /nice/ && $so > 0.5 && $score < 0.5) { + if (!$rule->{isnice} && $rule->{soratio} > 0.5 && $rule->{score} < 0.5) { $badrule = 'non-nice but -ve score'; } - - if ($tf =~ /nice/ && $so > 0.5 && $score < 0.5) { - if ($ffso < 0.5) { + if ($rule->{isnice} && $rule->{soratio} > 0.5 && $rule->{score} < 0.5) { + if ($opt_f && $rule->{freq_fn} < $rule->{freq_fp}) { $badrule = 'fn'; - } else { - # ignore, the FNs are overridden by other tests so it doesn't - # affect the overall results. } + # else { + # ignore, the FNs are overridden by other tests so it doesn't + # affect the overall results. + # } } # low number of matches overall - if ($nons < $LOW_MATCHES_PERCENT) + if ($rule->{ham_percent} < $LOW_MATCHES_PERCENT) { $badrule ||= ''; $badrule .= ', low matches'; } - } elsif ($score > 0.0) { + } elsif ($rule->{score} > 0.0) { # positive score with more nonspams than spams? bad. - if ($tf =~ /nice/ && $so < 0.5 && $score > 0.5) { + if ($rule->{isnice} && $rule->{soratio} < 0.5 && $rule->{score} > 0.5) { $badrule = 'nice but +ve score'; } - - if ($tf !~ /nice/ && $so < 0.5 && $score > 0.5) { - if ($ffso > 0.5) { + + if (!$rule->{isnice} && $rule->{soratio} < 0.5 && $rule->{score} > 0.5) { + if ($opt_f && $rule->{freq_fp} > $rule->{freq_fn}) { $badrule = 'fp'; - } else { - # ignore, the FPs are overridden by other tests so it doesn't - # affect the overall results. } + # else { + # ignore, the FPs are overridden by other tests so it doesn't + # affect the overall results. + # } } - + # low number of matches overall - if ($spam < $LOW_MATCHES_PERCENT) + if ($rule->{spam_percent} < $LOW_MATCHES_PERCENT) { $badrule ||= ''; $badrule .= ', low matches'; } - - } elsif ($score == 0.0) { + + } elsif ($rule->{score} == 0.0) { $badrule = 'score is 0'; } } - + if (defined $badrule) { - $badrule =~ s/^, //; chomp; - $output{$badrule} .= $_ . " ($badrule)\n"; + $badrule =~ s/^, //; + $output{$badrule} .= $rule->{name} . " ($badrule)\n"; } } @@ -156,182 +258,3 @@ exit; -sub concat_rule_lang { - my $rule = shift; - my $lang = shift; - - if (defined $lang && $lang ne '') { - return "[$lang]_$rule"; - } else { - return $rule; - } -} - -# note: do not use parse-rules-for-masses here, we need to do linting instead -# of your average parse -sub readrules { - my @files = <../rules/[0-9]*.cf>; - my $file; - %rulesfound = (); - %langs = (); - foreach $file (@files) { - open (IN, "<$file"); - while () { - s/#.*$//g; s/^\s+//; s/\s+$//; next if /^$/; - - # make all the foo-bar stuff foo_bar - 1 while s/^(\S+)-/\1_/g; - 1 while s/^(lang\s+\S+\s+\S+)-/\1_/g; - - my $lang = ''; - if (s/^lang\s+(\S+)\s+//) { - $lang = $1; $langs{$1} = 1; - } - - if (/^(header|rawbody|body|full|uri|meta)\s+(\S+)\s+/) { - $rulesfound{$2} = 1; - $rulefile{$2} ||= $file; - $scorefile{$1} = $file; - $score{$2} ||= 1.0; - $tflags{$2} ||= ''; - $descfile{$2} ||= $file; # a rule with no score or desc is OK - $description{$2}->{$lang} = undef; - - if (/^body\s+\S+\s+eval:/) { - # ignored - } elsif (/^body\s+\S+\s+(.*)$/) { - my $re = $1; - - # If there's a ( in a rule where it should be (?:, flag it. - # but ignore [abc(] ... - if ($re =~ /[^\\]\([^\?]/ && $re !~ /\[[^\]]*[^\\]\(/) { - print "warning: non-(?:...) capture in regexp in $file: $_\n"; - } - if ($re =~ /\.[\*\+]/) { - print "warning: .* in regexp in $file: $_\n"; - } - if ($re =~ /[^\\]\{(\d*),?(\d*?)\}/) { - if ($1 > 120 || $2 > 120) { - print "warning: long .{n} in regexp in $file: $_\n"; - } - } - } - - } elsif (/^describe\s+(\S+)\s+(.*?)\s*$/) { - $rulesfound{$1} = 1; - $descfile{concat_rule_lang ($1, $lang)} ||= $file; - $descfile{$1} ||= $file; - $description{$1}->{$lang} = $2; - } elsif (/^tflags\s+(\S+)\s+(.+)$/) { - $rulesfound{$1} = 1; - $tflags{$1} = $2; - $tflagsfile{concat_rule_lang ($1, $lang)} = $file; - $tflagsfile{$1} = $file; - } elsif (/^score\s+(\S+)\s+(.+)$/) { - $rulesfound{$1} = 1; - $scorefile{concat_rule_lang ($1, $lang)} = $file; - $scorefile{$1} = $file; - $score{$1} = $2; - } elsif (/^(clear_report_template|clear_spamtrap_template|report|spamtrap| - clear_terse_report_template|terse_report| - required_score|ok_locales|ok_languages|test|lang| - spamphrase|whitelist_from|require_version| - clear_unsafe_report_template|unsafe_report| - (?:bayes_)?auto_learn_threshold_nonspam|(?:bayes_)?auto_learn_threshold_spam| - (?:bayes_)?auto_learn - )/x) { - next; - } else { - print "warning: unknown rule in $file: $_\n"; - } - } - close IN; - } - @langsfound = sort keys %langs; - @rulesfound = sort keys %rulesfound; -} - -sub lintrules { - my %possible_renames = (); - - foreach my $rule (@rulesfound) { - my $match = $rule; - $match =~ s/_\d+[^_]+$//gs; # trim e.g. "_20K" - $match =~ s/[^A-Z]+//gs; # trim numbers etc. - - if (defined ($rulefile{$rule}) && $possible_renames{$match} !~ / \Q$rule\E\b/) { - $possible_renames{$match} .= " ".$rule; - } - $possible_rename_matches{$rule} = $match; - } - - foreach my $lang ('', @langsfound) { - foreach my $baserule (@rulesfound) { - next if ( $baserule =~ /^__/ || $baserule =~ /^T_/ ); - - my $rule = concat_rule_lang ($baserule, $lang); - my $f = $descfile{$rule}; - my $warned = ''; - - if (defined $f && !defined ($rulefile{$rule}) - && !defined ($rulefile{$baserule})) - { - print "warning: $baserule has description, but no rule: $f\n"; - $warned .= ' lamedesc'; - } - - # Check our convention for rule length - if ( (($lang ne '' && defined($rulefile{$rule})) || ($lang eq '' && defined ($rulefile{$baserule}))) && length $baserule > 22 ) { - print "warning: $baserule has a name longer than 22 chars: $f\n"; - } - # Check our convention for rule length - if ( (($lang ne '' && defined($rulefile{$rule})) || ($lang eq '' && defined ($rulefile{$baserule}))) && defined $description{$baserule}->{$lang} && length $description{$baserule}->{$lang} > 50 ) { - print "warning: $baserule has a description longer than 50 chars: $f\n"; - } - - # lang rule trumps normal rule - $f = $rulefile{$rule} || $rulefile{$baserule}; - # if the rule exists, and the language/rule description doesn't exist ... - if ( defined $f && !defined $description{$baserule}->{$lang} ) - { - print "warning: $baserule exists, ",( $lang ne '' ? "lang $lang, " : "" ),"but has no description: $f\n"; - $warned .= ' lamedesc'; - } - - - $f = $scorefile{$rule}; - if (defined $f && !defined ($rulefile{$rule}) - && !defined ($rulefile{$baserule})) - { - print "warning: $baserule has score, but no rule: $f\n"; - $warned .= ' lamescore'; - } - - my $r = $possible_rename_matches{$rule}; - if ($warned ne '' && defined $r) { - my @matches = split (' ', $possible_renames{$r}); - if (scalar @matches != 0) { - my $text = ''; - - # now try and figure out "nearby" rules with no description/score - foreach my $baser (@matches) { - my $blang; - if ($descfile{$rule} =~ /text_(\S\S)\./) { - $blang = $1; - } - my $r = concat_rule_lang ($baser, $blang); - #warn "$r $descfile{$r} $descfile{$baser}"; - next if ($warned =~ /lamedesc/ && (defined $descfile{$r})); - next if ($warned =~ /lamescore/ && (defined $scorefile{$r})); - $text .= " $baser"; - } - - if ($text ne '') { - print "warning: (possible renamed rule? $text)\n"; - } - } - } - } - } -} - Index: mass-check =================================================================== --- mass-check (revision 10703) +++ mass-check (working copy) @@ -16,97 +16,151 @@ # limitations under the License. # -sub usage { - die < - -j=jobs specify the number of processes to run simultaneously - --net turn on network checks! - --mid report Message-ID from each message - --debug report debugging information - --progress show progress updates during check - --rewrite=OUT save rewritten message to OUT (default is /tmp/out) - --showdots print a dot for each scanned message - --rules=RE Only test rules matching the given regexp RE - --restart=N restart all of the children after processing N messages - --deencap=RE Extract SpamAssassin-encapsulated spam mails only if they - were encapsulated by servers matching the regexp RE - (default = extract all SpamAssassin-encapsulated mails) - - log options - -o write all logs to stdout - --loghits log the text hit for patterns (useful for debugging) - --loguris log the URIs found - --hamlog=log use as ham log ('ham.log' is default) - --spamlog=log use as spam log ('spam.log' is default) - - message selection options - -n no date sorting or spam/ham interleaving - --after=N only test mails received after time_t N (negative values - are an offset from current time, e.g. -86400 = last day) - or after date as parsed by Time::ParseDate (e.g. '-6 months') - --before=N same as --after, except received times are before time_t N - --all don't skip big messages - --head=N only check first N ham and N spam (N messages if -n used) - --tail=N only check last N ham and N spam (N messages if -n used) - - simple target options (implies -o and no ham/spam classification) - --dir subsequent targets are directories - --file subsequent targets are files in RFC 822 format - --mbox subsequent targets are mbox files - --mbx subsequent targets are mbx files - - Just left over functions we should remove at some point: - --bayes report score from Bayesian classifier - - non-option arguments are used as target names (mail files and folders), - the target format is: :: - is "spam" or "ham" - is "dir", "file", "mbx", or "mbox" - is a file or directory name. globbing of ~ and * is supported +=head1 NAME -ENDOFUSAGE -} +mass-check - Generates SpamAssassin scores and results for large +amounts of mail +=head1 SYNOPSIS + + mass-check [options] class:format:location ... + mass-check [options] {--dir | --file | --mbox} target ... + mass-check [options] -f file + + Options: + -f=file read list of targets from + -j=jobs specify the number of processes to run simultaneously + --net turn on network checks! + --mid report Message-ID from each message + --debug report debugging information + --progress show progress updates during check + --rewrite=OUT save rewritten message to OUT (default is /tmp/out) + --showdots print a dot for each scanned message + --rules=RE Only test rules matching the given regexp RE + --restart=N restart all of the children after processing N messages + + SpamAssassin options + -c=dir set configuration/rules directory + -p=file set user preferences file (default: none) + -s=dir set site rules configuration directory + -u=dir set user-state directory + --dist assumes the script is being run from the masses/ dir of + the unpacked tarball, and makes appropriate guesses for + -p and -c + --deencap=RE Extract SpamAssassin-encapsulated spam mails only if they + were encapsulated by servers matching the regexp RE + (default = extract all SpamAssassin-encapsulated mails) + + log options + -o write all logs to stdout + --loghits log the text hit for patterns (useful for debugging) + --loguris log the URIs found + --log=file log to (masses.log is default) + + message selection options + -n no date sorting or spam/ham interleaving + --after=N only test mails received after time_t N (negative values + are an offset from current time, e.g. -86400 = last day) + or after date as parsed by Time::ParseDate (e.g. '-6 months') + --before=N same as --after, except received times are before time_t N + --all don't skip big messages + --head=N only check first N ham and N spam (N messages if -n used) + --tail=N only check last N ham and N spam (N messages if -n used) + + simple target options (implies -o and no ham/spam classification) + --dir subsequent targets are directories + --file subsequent targets are files in RFC 822 format + --mbox subsequent targets are mbox files + --mbx subsequent targets are mbx files + + Just left over functions we should remove at some point: + --bayes report score from Bayesian classifier + --hamlog=log use as ham log ('ham.log' is default) + --spamlog=log use as spam log ('spam.log' is default) + +=head1 DESCRIPTION + +B is designed to assist with rule development and +generation of SpamAssassin scored. It reads in mail from the +location(s) specified on the command line (in the first form above), +given in the form I, where I is either +"spam" or "ham" (non-spam), I is one of "dir" (Maildirs, MH, +etc), "file", "mbox" (mboxes can be gzipped) or "mbx". + +B will analyze each message using SpamAssassin and +generate one-line of output per message, (by default to masses.log) in +the following format: + + {s|h} {s|h} score filename tests-hit + +The first field is the message's class as given on the command line +(ham or spam). The second is the message's class as determined by +SpamAssassin. The third is the message's score, as determined by +SpamAssassin. The fourth field contains the message's filename; for +mboxes, this contains the filename and the byte offset from the +beginning of the file separated by a period. The last field contains a +list of all the tests the message hit separated by commas. + +If you want to run this on the currently installed version of +SpamAssassin's rules for sitewide use, make sure your user_prefs file +contains no rules. + +=head1 BUGS + +Please report bugs to http://bugzilla.spamassassin.org/ + +=head1 SEE ALSO + +L, L, L, +L + +=cut + ########################################################################### -use vars qw($opt_c $opt_p $opt_f $opt_j $opt_n $opt_o $opt_all $opt_bayes - $opt_debug $opt_format $opt_hamlog $opt_head $opt_loghits - $opt_mid $opt_mh $opt_ms $opt_net $opt_nosort $opt_progress - $opt_showdots $opt_spamlog $opt_tail $opt_rules $opt_restart - $opt_loguris $opt_after $opt_before $opt_rewrite $opt_deencap); +use vars qw($opt_c $opt_p $opt_f $opt_j $opt_n $opt_o $opt_all + $opt_bayes $opt_before $opt_debug $opt_dist $opt_format + $opt_hamlog $opt_head $opt_log $opt_loghits $opt_mid + $opt_mh $opt_ms $opt_net $opt_nosort $opt_p $opt_progress + $opt_s $opt_showdots $opt_spamlog $opt_tail $opt_rules + $opt_restart $opt_loguris $opt_after $opt_rewrite $opt_u + $opt_deencap); use FindBin; use lib "$FindBin::Bin/../lib"; eval "use bytes"; use Mail::SpamAssassin::ArchiveIterator; use Mail::SpamAssassin; -use Getopt::Long; +use Getopt::Long qw(:config bundling auto_help); use POSIX qw(strftime); use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; }; +use strict; # Why wasn't this on? use Config; # default settings -$opt_c = "$FindBin::Bin/../rules"; -$opt_p = "$FindBin::Bin/spamassassin"; + $opt_j = 1; $opt_net = 0; -$opt_hamlog = "ham.log"; -$opt_spamlog = "spam.log"; +$opt_log = "masses.log"; -GetOptions("c=s", "p=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug", - "hamlog=s", "head=i", "loghits", "mh", "mid", "ms", "net", - "progress", "rewrite:s", "showdots", "spamlog=s", "tail=i", - "rules=s", "restart=i", "after=s", "before=s", "loguris", "deencap=s", +GetOptions("c=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug", + "deencap=s", "dist!", "hamlog=s", "head=i", "log=s", + "loghits", "mh", "mid", "ms", "net", "p=s", "progress", + "rewrite:s", "s=s", "showdots", "spamlog=s", "tail=i", + "rules=s", "restart=i", "u=s", "after=s", "loguris", "dir" => sub { $opt_format = "dir"; }, - "file" => sub { $opt_format = "file"; }, + "file" => sub {$opt_format = "file"; }, "mbox" => sub { $opt_format = "mbox"; }, "mbx" => sub { $opt_format = "mbx"; }, '<>' => \&target) or usage(); +if ($opt_hamlog || $opt_spamlog) { # Old style logging + $opt_hamlog ||= "ham.log"; + $opt_spamlog ||= "spam.log"; +} + +my @targets; + if ($opt_f) { open(F, $opt_f) || die $!; push(@targets, map { chomp; $_ } ); @@ -115,45 +169,89 @@ if (scalar @targets == 0) { usage(); } -#if ($opt_ms) { -#find_missed($opt_spamlog); -#} -#elsif ($opt_mh) { -#find_missed($opt_hamlog); -#} +# Auto-detect --dist option +if (!defined $opt_dist) { + if (-f "$FindBin::Bin/../spamassassin.raw") { + warn "Automatically using --dist. Assuming you are running from the unpacked tarball. Use --no-dist to override."; + $opt_dist = 1; + } +} -$spamtest = new Mail::SpamAssassin ({ - 'debug' => $opt_debug, - 'rules_filename' => $opt_c, - 'userprefs_filename' => "$opt_p/user_prefs", - 'site_rules_filename' => "$opt_p/local.cf", - 'userstate_dir' => "$opt_p", - 'save_pattern_hits' => $opt_loghits, - 'dont_copy_prefs' => 1, - 'local_tests_only' => $opt_net ? 0 : 1, - 'only_these_rules' => $opt_rules, - 'ignore_safety_expire_timeout' => 1, - PREFIX => '', - DEF_RULES_DIR => $opt_c, - LOCAL_RULES_DIR => '', -}); +my $local_rules_dir; +if ($opt_dist) { # Set defaults + $opt_c ||= "$FindBin::Bin/../rules"; + $opt_p ||= "$FindBin::Bin/mass-check.cf"; + $opt_u ||= "$FindBin::Bin/spamassassin"; + $opt_s ||= "$FindBin::Bin/spamassassin"; + $local_rules_dir = ''; +} +else { + if(!$opt_u) { + # Assuming this is OK, since mass-check isnt supported on windows, is it? + # Also, should there be some check to make sure that previous mass-check stuff isn't in there? + # AFAICT, there isn't otherwise.... + if ( -d "${ENV{HOME}}/.spamassassin" ) { + $opt_u = "${ENV{HOME}}/.spamassassin/mass-check"; + mkdir $opt_u, 0700 if (! -d $opt_u); + } + } + +# Leave the rest to SA, we'll get it afterwards + +} + +my $spamtest = new Mail::SpamAssassin ({ + 'debug' => $opt_debug, + 'rules_filename' => $opt_c, + 'userprefs_filename' => $opt_p, + 'site_rules_filename' => $opt_s, + 'userstate_dir' => $opt_u, + 'save_pattern_hits' => $opt_loghits, + 'dont_copy_prefs' => 1, + 'local_tests_only' => $opt_net ? 0 : 1, + 'only_these_rules' => $opt_rules, + 'ignore_safety_expire_timeout' => 1, + DEF_RULES_DIR => $opt_c, + LOCAL_RULES_DIR => $local_rules_dir, + }); + $spamtest->compile_now(1); -$spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf"); +if ($opt_dist) { + $spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf"); +} my $who = `id -un 2>/dev/null`; chomp $who; my $where = `uname -n 2>/dev/null`; chomp $where; my $when = `date -u`; chomp $when; -my $revision = "unknown"; -if (open(TESTING, "$opt_c/70_testing.cf")) { - chomp($revision = ); - $revision =~ s/.*\$Rev:\s*(\S+).*/$1/; - close(TESTING); +my $revision; + +if ($opt_dist) { + my $rev = "unknown"; + if (open(TESTING, "$opt_c/70_testing.cf")) { + chomp($rev = ); + $rev =~ s/.*\$Rev:\s*(\S+).*/$1/; + close(TESTING); + } + $revision = "SVN revision: $rev"; } +else { + $revision = "Local"; +} + my $log_header = "# mass-check results from $who\@$where, on $when\n" . "# M:SA version ".$spamtest->Version()."\n" . - "# SVN revision: $revision\n" . + "# $revision\n" . "# Perl version: $] on $Config{archname}\n"; + +if (!$opt_dist) { + my @paths = ( $spamtest->{rules_filename}, $spamtest->{site_rules_filename}, $spamtest->{userprefs_filename} ); + $log_header .= "# Using configuration:\n"; + foreach my $file (@paths) { + $log_header .= "# $file\n"; + } +} + my $host = $ENV{'HOSTNAME'} || $ENV{'HOST'} || `hostname` || 'localhost'; chomp $host; @@ -222,7 +320,7 @@ autoflush STDOUT 1; print STDOUT $log_header; } - else { + elsif ($opt_hamlog || $opt_spamlog) { open(HAM, "> $opt_hamlog"); open(SPAM, "> $opt_spamlog"); autoflush HAM 1; @@ -230,6 +328,11 @@ print HAM $log_header; print SPAM $log_header; } + else { + open(OUT, "> $opt_log"); + autoflush OUT 1; + print OUT $log_header; + } $init_results = 1; } @@ -239,25 +342,36 @@ # don't open results files until we get here to avoid overwriting files &init_results if !$init_results; - if ($class eq "s") { - if ($opt_o) { print STDOUT $result; } else { print SPAM $result; } - $spam_count++; + if ($opt_o) { + print STDOUT $result; } - elsif ($class eq "h") { - if ($opt_o) { print STDOUT $result; } else { print HAM $result; } - $ham_count++; + elsif ($opt_spamlog || $opt_hamlog) { + if ($class eq "s") { + print SPAM $result; + } else { + print HAM $result; + } } + else { + print OUT $result; + } $total_count++; #warn ">> result: $total_count $class $time\n"; if ($opt_progress) { + if ($class eq "s") { + $spam_count++; + } + else { + $ham_count++; + } progress($time); } } sub wanted { - my (undef, $id, $time, $dataref) = @_; + my ($class, $id, $time, $dataref) = @_; my $out; my $ma = $spamtest->parse($dataref, 1); @@ -308,18 +422,22 @@ push(@extra, "mid=$mid"); } - my $yorn; + my $result; my $score; my $tests; my $extra; if ($opt_loguris) { - $yorn = '.'; + $result = '.'; $score = 0; $tests = join(" ", sort @uris); $extra = ''; } else { - $yorn = $status->is_spam() ? 'Y' : '.'; + if ($status->is_spam()) { + $result = "s"; + } else { + $result = "h"; + } $score = $status->get_score(); $tests = join(",", sort(grep(length,$status->get_names_of_tests_hit(),$status->get_names_of_subtests_hit()))); $extra = join(",", @extra); @@ -333,7 +451,7 @@ $id =~ s/\s/_/g; - $out .= sprintf("%s %2d %s %s %s\n", $yorn, $score, $id, $tests, $extra); + $out .= sprintf("%s %s %05.2f %s %s %s\n", $class, $result, $score, $id, $tests, $extra); if ($tests =~ /MICROSOFT_EXECUTABLE|MIME_SUSPECT_NAME/) { $out .= logkilled($ma, $id, "possible virus"); Index: mk-baseline-results =================================================================== --- mk-baseline-results (revision 10703) +++ mk-baseline-results (working copy) @@ -10,7 +10,7 @@ echo "Classification success on test corpora, at default threshold:" echo -./logs-to-c --spam=spam-validate.log --nonspam=nonspam-validate.log --threshold 5 --count --scoreset=$SCORESET | sed -e 's/^Reading.*//' -e '/^$/d' +./fp-fn-statistics --logfile=masses-validate.log --threshold 5 --scoreset=$SCORESET | sed -e 's/^Reading.*//' -e '/^$/d' echo echo "Results on test corpora at various alternative thresholds:" @@ -18,7 +18,7 @@ # list a wide range of thresholds, so that we can make graphs later ;) for thresh in -4 -3 -2 -1 0 1 2 3 4 4.5 5.5 6 6.5 7 8 9 10 12 15 17 20 ; do - ./logs-to-c --spam=spam-validate.log --nonspam=nonspam-validate.log --threshold $thresh --count --scoreset=$SCORESET | sed -e 's/^Reading.*//' -e '/^$/d' + ./logs-to-c --logfile=masses-validate.log --threshold $thresh --scoreset=$SCORESET | sed -e 's/^Reading.*//' -e '/^$/d' echo done Index: README =================================================================== --- README (revision 10703) +++ README (working copy) @@ -52,11 +52,11 @@ This script is used to perform "mass checks" of a set of mailboxes, Cyrus folders, and/or MH mail spools. It generates summary lines like this: - Y 7 /home/jm/Mail/Sapm/1382 SUBJ_ALL_CAPS,SUPERLONG_LINE,SUBJ_FULL_OF_8BITS + s s 07.22 /home/jm/Mail/Sapm/1382 SUBJ_ALL_CAPS,SUPERLONG_LINE,SUBJ_FULL_OF_8BITS or for mailboxes, - . 1 /path/to/mbox:<5.1.0.14.2.20011004073932.05f4fd28@localhost> TRACKER_ID,BALANCE_FOR_LONG + h h 01.32 /path/to/mbox:<5.1.0.14.2.20011004073932.05f4fd28@localhost> TRACKER_ID,BALANCE_FOR_LONG listing the path to the message or its message ID, its score, and the tests that triggered on that mail. @@ -65,23 +65,24 @@ get good hits with few false positives, etc., and re-score the tests to optimise the ratio. - This script relies on the spamassassin distribution directory living in "..". + If given the --dist option, this script relies on the spamassassin + distribution directory living in "..". If this script is not in the + distribution directory, it will generate logs based on the site-wide + rules, as well as personal rules. logs-to-c : - Takes the "spam.log" and "nonspam.log" files and converts them into C - source files and simplified data files for use by the C score optimization - algorithm. (Called by "make" when you build the perceptron, so generally - you won't need to run it yourself.) + Takes the "masses.log" file and converts them into C source files + and simplified data files for use by the C score optimization + algorithm. (Called by "make" when you build the perceptron, so + generally you won't need to run it yourself.) - hit-frequencies : Analyses the log files and computes how often each test hits, overall, for spam mails and for non-spam. - mk-baseline-results : Compute results for the baseline scores (read from ../rules/*). If you @@ -91,7 +92,6 @@ It will output statistics on the current ruleset to ../rules/STATISTICS.txt, suitable for a release build of SpamAssassin. - perceptron.c : Perceptron learner by Henry Stern. See "README.perceptron" for details. Index: fp-fn-statistics =================================================================== --- fp-fn-statistics (revision 10703) +++ fp-fn-statistics (working copy) @@ -1,3 +1,146 @@ -#!/bin/sh +#!/usr/bin/perl -w +# +# <@LICENSE> +# Copyright 2004 Apache Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +use FindBin; +use lib "$FindBin::Bin/../lib"; +use Mail::SpamAssassin::Masses; +use Getopt::Long; +use strict; +use warnings; -exec ./logs-to-c --count $* +use vars qw{$opt_c $opt_l $opt_s $opt_h $opt_t $opt_lambda}; + +GetOptions("c|cffile=s@" => \$opt_c, "l|logfile=s" => \$opt_l, +"s|scoreset=i" => \$opt_s, "h|help" => \$opt_h, "t|threshold" => +\$opt_t, "lambda" => \$opt_lambda); + + +# Need to add rule for scores file +sub usage { + die "logs-to-c [-c rules dir] [-l log file] [-s SS] [-t] + + -c,--cffile p use p as rules directory + -s,--scoreset SS use scoreset SS + -l,--log log read logs from log instead of from masses.log + -t,--threshold n use a threshold of n (default: 5) + --lambda=n use a lambda value of n (default: 5) +"; +} + + +usage() if $opt_h; + +$opt_c ||= "$FindBin::Bin/../rules"; +$opt_t = (defined($opt_t) ? $opt_t : 5); +$opt_s ||= 0; #| +$opt_l ||= "masses.log"; +$opt_lambda ||= 5; + +print "$opt_c used for config"; + +my $nybias = 10; + + +my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c, + scoreset => $opt_s, # ,, + logfile => $opt_l}); + +$masses->readlogs(); + +my $logs = $masses->get_logs(); + +my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore) = (0,0,0,0,0,0,0,0); + +my $num_spam = $masses->get_num_spam(); +my $num_ham = $masses->get_num_ham(); +my $num_logs = $num_spam + $num_ham; + +my $count = 0; + +my $score; + +foreach my $log (@$logs) { + + $score = 0; + foreach my $test (@{$log->{tests_hit}}) { + + next if ($test->{issubrule}); + next if (!$test->{score}); + + $score += $test->{score}; + + } + + if ($score >= $opt_t) { + if ($log->{isspam}) { + $ga_yy++; + $yyscore += $score; + } + else { + $ga_ny++; + $nyscore += $score; + } + } else { + if ($log->{isspam}) { + $ga_yn++; + $ynscore += $score; + } + else { + $ga_nn++; + $nnscore += $score; + } + } +} + +$nybias = $nybias * ($num_spam / $num_ham); + +my $fprate = ($ga_ny / $num_logs) * 100.0; +my $fnrate = ($ga_yn / $num_logs) * 100.0; + +printf ("\n# SUMMARY for threshold %3.1f:\n", $opt_t); +printf "# Correctly non-spam: %6d %4.2f%% (%4.2f%% of non-spam corpus)\n", $ga_nn, + ($ga_nn / $num_logs) * 100.0, ($ga_nn / $num_ham) * 100.0; +printf "# Correctly spam: %6d %4.2f%% (%4.2f%% of spam corpus)\n" , $ga_yy, + ($ga_yy / $num_logs) * 100.0, ($ga_yy / $num_spam) * 100.0; +printf "# False positives: %6d %4.2f%% (%4.2f%% of nonspam, %6.0f weighted)\n", $ga_ny, + $fprate, ($ga_ny / $num_ham) * 100.0, $nyscore*$nybias; +printf "# False negatives: %6d %4.2f%% (%4.2f%% of spam, %6.0f weighted)\n", $ga_yn, + $fnrate, ($ga_yn / $num_spam) * 100.0, $ynscore; + +# convert to the TCR metrics used in the published lit +my $nspamspam = $ga_yy; +my $nspamlegit = $ga_yn; +my $nlegitspam = $ga_ny; +my $nlegitlegit = $ga_yn; +my $nlegit = $num_ham; +my $nspam = $num_spam; + +my $werr = ($opt_lambda * $nlegitspam + $nspamlegit) + / ($opt_lambda * $nlegit + $nspam); + +my $werr_base = $nspam + / ($opt_lambda * $nlegit + $nspam); + +$werr ||= 0.000001; # avoid / by 0 +my $tcr = $werr_base / $werr; + +my $sr = ($nspamspam / $nspam) * 100.0; +my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0; +printf "# TCR: %3.6f SpamRecall: %3.3f%% SpamPrec: %3.3f%% FP: %3.2f%% FN: %3.2f%%\n", $tcr, $sr, $sp, $fprate, $fnrate; + Index: extract-message-from-mbox =================================================================== --- extract-message-from-mbox (revision 10703) +++ extract-message-from-mbox (working copy) @@ -68,7 +68,7 @@ sub masscheck { while () { - my $mail = (split(/\s+/, $_))[2]; + my $mail = (split(/\s+/, $_))[3]; $mail =~ tr/_/ /; if ($mail =~ /^(.*)\.(\d+)$/) { extract($1, $2); Index: logs-to-c =================================================================== --- logs-to-c (revision 10703) +++ logs-to-c (working copy) @@ -16,257 +16,243 @@ # limitations under the License. # -use Getopt::Long; -use vars qw($opt_cffile $opt_count $opt_lambda $opt_threshold - $opt_spam $opt_nonspam); +=head1 NAME -GetOptions("cffile=s", "count", "lambda=f", "threshold=f", "spam=s", "nonspam=s", "scoreset=i"); -my $argcffile = $opt_cffile; +logs-to-c - Convert a mass-check log into perceptron format -my $justcount = 0; -if ($opt_count) { $justcount = 1; } +=head1 SYNOPSIS -my $threshold = 5; -if (defined $opt_threshold) { $threshold = $opt_threshold; } +logs-to-c [options] -$opt_spam ||= 'spam.log'; -$opt_nonspam ||= 'ham.log'; -$opt_scoreset = 0 if ( !defined $opt_scoreset ); + Options: + -c,--cffile=path Use path as the rules directory + -s,--scoreset=n Use scoreset n + -l,--logfile=file Read in file instead of masses.log + -o,--outputdir Put output in the specified dir (default tmp/) -my $nybias = 10; +=head1 DESCRIPTION -# lambda value for TCR equation, indicating the "cost" of recovering -# from an FP. The values are: 1 = tagged only, 9 = mailed back to -# sender asking for token (TMDA style), 999 = deleted outright. -# We (SpamAssassin) use a default of 5, representing "moved to -# infrequently-read folder". +B will read the mass-check log F or as +specified by the B<--logfile> option, and convert it into the format +needed by the perceptron. This is a format that is simple for the +perceptron to parse, but is not very readable to humans. -my $lambda = 5; -if ($opt_lambda) { $lambda = $opt_lambda; } +By default, output will be put in the directory ./tmp/ unless another +directory is specified by the B<--outputdir> option. (Note: at the +current time, this must be /tmp/ in order for the perceptron to +compile properly.) -my %is_spam = (); -my %tests_hit = (); -my %mutable_tests = (); +=head1 BUGS -use vars qw(%rules %allrules); +Please report bugs to http://bugzilla.spamassassin.org/ -readscores(); +=head1 SEE ALSO -print "Reading per-message hit stat logs and scores...\n"; -my ($num_tests, $num_spam, $num_nonspam); -my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore); +L, L, L -readlogs(); -read_ranges(); +=cut -if ($justcount) { - $nybias = $nybias*($num_spam / $num_nonspam); - evaluate(); -} else { - print "Writing logs and current scores as C code...\n"; - writescores_c(); +use FindBin; +use lib "$FindBin::Bin/../lib"; +use Mail::SpamAssassin::Masses; +use Getopt::Long qw(:config bundling auto_help); +use strict; +use warnings; + +use vars qw{$opt_c $opt_l $opt_s $opt_o}; + +GetOptions("c|cffile=s@" => \$opt_c, + "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode) + "l|logfile=s" => \$opt_l, + "o|output=s" => \$opt_o); + + +$opt_o ||= "./tmp/"; +if (!-d $opt_o) { + mkdir $opt_o, 0777 or die "Can't mkdir $opt_o"; } -exit 0; +if (!$opt_c || !scalar(@$opt_c)) { + # Try to read this in from the log, if possible + open IN, $opt_l or die "Can't open $opt_l: $!"; + my $files = 0; # are we in the files section? + while() { + if (!$files) { + if (/^\# SVN revision:/) { + $opt_c = [ "$FindBin::Bin/../rules" ]; + last; + } elsif (/^\# Using configuration:$/) { + $files = 1; + } + } elsif (/^\#\s+(.*)\s*$/) { + push (@$opt_c, $1); + } else { + # All done! + last; + } + } -sub readlogs { - my $count = 0; - $num_spam = $num_nonspam = 0; + foreach my $file (@$opt_c) { + die "Can't read $file" unless -r $file; + } +} - if ($justcount) { - $ga_yy = $ga_ny = $ga_yn = $ga_nn = 0; - $yyscore = $ynscore = $nyscore = $nnscore = 0.0; - } +# ignore rules that are subrules -- we don't generate scores for them... - foreach my $file ($opt_spam, $opt_nonspam) { - open (IN, "<$file"); +# Note: this will cause a difference over the old logs-to-c since rank +# is dependent on the frequencies of all rules, not just non-subrules - while () { - next if /^\#/; - next if /^$/; - if($_ !~ /^.\s+([-\d]+)\s+\S+\s*/) { warn "bad line: $_"; next; } - my $hits = $1; -#my $foo = $_; - $_ = $'; s/(?:bayes|time)=\S+//; s/,,+/,/g; s/^\s+//; s/\s+$//; +my $greprules = sub { return 0 if $_[1]->{issubrule}; return 1; }; - my $score = 0; - my @tests = (); - foreach my $tst (split (/,/, $_)) { - next if ($tst eq ''); - if (!defined $scores{$tst}) { - #warn "unknown test in $file, ignored: $tst\n"; - next; - } +$opt_s ||= 0; # | - # Make sure to skip any subrules! - next if ( $allrules{$tst}->{issubrule} ); +my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c, + scoreset => $opt_s, # ,, + logfile => $opt_l, + greprules => $greprules }); - if ($justcount) { - $score += $scores{$tst}; - } else { - push (@tests, $tst); - } - } +$masses->readlogs(); +$masses->do_score_ranges(); - if (!$justcount) { - $tests_hit{$count} = \@tests; - } +my $rules = $masses->get_rules_array(); +my $logs = $masses->get_logs(); - if ($file eq $opt_spam) { - $num_spam++; - if ($justcount) { - if ($score >= $threshold) { - $ga_yy++; $yyscore += $score; - } else { - $ga_yn++; $ynscore += $score; - } - } else { - $is_spam{$count} = 1; - } - } else { - $num_nonspam++; - if ($justcount) { - if ($score >= $threshold) { -#print "$score -- $foo"; - $ga_ny++; $nyscore += $score; - } else { - $ga_nn++; $nnscore += $score; - } - } else { - $is_spam{$count} = 0; - } - } - $count++; - } - close IN; - } - $num_tests = $count; -} +my @index_to_rule; +my $num_spam = $masses->get_num_spam(); +my $num_ham = $masses->get_num_ham(); +# This is misleading -- num_tests is really num_msgs +my $num_tests = $num_spam + $num_ham; -sub readscores { - if (!defined $argcffile) { $argcffile = "../rules"; } - print "Reading scores from \"$argcffile\"...\n"; - system ("./parse-rules-for-masses -d \"$argcffile\" -s $opt_scoreset") and die; - require "./tmp/rules.pl"; - %allrules = %rules; # ensure it stays global -} +# Write logs and scores as C code +writescores_c(); +writetests_c(); + + sub writescores_c { - my $output = ''; - my $size = 0; + my $mutable = 0; - my $i; + my $output = ''; + my $count = 0; + my $score = 0; - # jm: now, score-ranges-from-freqs has tflags to work from, so - # it will always list all mutable tests. + foreach my $rule (sort {($b->{ismutable} <=> $a->{ismutable}) || + ($a->{name} cmp $b->{name}) } @$rules) { - @index_to_rule = sort {($ignored_rule{$a} <=> $ignored_rule{$b}) || - ($mutable_tests{$b} <=> $mutable_tests{$a}) || - ($a cmp $b)} (keys %scores); - my $max_hits_per_msg = 0; - for ($file = 0; $file < $num_tests; $file++) { - my(@hits) = - grep {(! $ignored_rule{$_}) && $mutable_tests{$_}} (@{$tests_hit{$file}}); - if ((scalar(@hits)+1) > $max_hits_per_msg) { - $max_hits_per_msg = scalar(@hits)+1; - } - } + $score = $rule->{score}; - for ($i = 0; $i <= $#index_to_rule; $i++) { - my $name = $index_to_rule[$i]; - $rule_to_index{$name} = $i; + # ignored rules (i.e. no scores) + next unless $score; - if ($ignored_rule{$name}) { next; } + # also ignore rules with score range 0 + next if (!$rule->{range_lo} && !$rule->{range_hi}); - if ($mutable_tests{$name} == 0) { - $range_lo{$name} = $range_hi{$name} = $scores{$name}; - } else { + # Set an index + $rule->{index} = $count; + $index_to_rule[$count] = $rule; # add the reference to the array + + if ($rule->{ismutable}) { $mutable++; - if ($range_lo{$name} > $range_hi{$name}) { - ($range_lo{$name},$range_hi{$name}) = - ($range_hi{$name},$range_lo{$name}); + if ($score > $rule->{range_hi}) { + $score = $rule->{range_hi} - 0.001; + } elsif ($score < $rule->{range_lo}) { + $score = $rule->{range_lo} + 0.001; } - #$range_lo{$name} ||= 0.1; - #$range_hi{$name} ||= 1.5; } + # These should all be set properly if not mutable + # score = range_lo = range_hi + else { + warn "hi != lo for " . $rule->{name} . "!" if $rule->{range_lo} != $rule->{range_hi}; + $score = $rule->{range_hi} = $rule->{range_lo}; + } - $output .= ".".$i."\n". - "n".$name."\n". - "b".$scores{$name}."\n". - "m".$mutable_tests{$name}."\n". - "l".$range_lo{$name}."\n". - "h".$range_hi{$name}."\n"; - $size++; + $output .= "." . $count . "\n" . + "n" . $rule->{name} . "\n" . + "b" . $score . "\n" . + "m" . $rule->{ismutable} . "\n" . + "l" . $rule->{range_lo} . "\n" . + "h" . $rule->{range_hi} . "\n"; + + $count++; + } + # Output this - open (DAT, ">tmp/scores.data"); - print DAT "N$size\n", "M$mutable\n", # informational only - $output; + open (DAT, ">$opt_o/scores.data"); + print DAT "N$count\n", "M$mutable\n"; # informational + print DAT $output; close DAT; - open (OUT, ">tmp/scores.h"); - print OUT " + open (OUT, ">$opt_o/scores.h"); + print OUT < #include #include - -int num_scores = $size; + +int num_scores = $count; int num_mutable = $mutable; -unsigned char is_mutable[$size]; -double range_lo[$size]; -double range_hi[$size]; -double bestscores[$size]; -char *score_names[$size]; -double tmp_scores[$size][2]; +unsigned char is_mutable[$count]; +double range_lo[$count]; +double range_hi[$count]; +double bestscores[$count]; +char *score_names[$count]; +double tmp_scores[$count][2]; unsigned char ny_hit[$mutable]; unsigned char yn_hit[$mutable]; - + double lookup[$mutable]; - + /* readscores() is defined in tests.h */ +EOF -"; close OUT; - writetests_c($max_hits_per_msg); # make sure $rule_to_index is around } + sub writetests_c { - my $max_hits_per_msg = $_[0]; - my(%uniq_files) = (); - my(%count_keys) = (); - my(%file_key) = (); + my $max_hits_per_msg = 0; + my @goodtests; + my %uniq_logs; + my $uniq_key; - my $file; + my $i = 0; - for ($file = 0; $file < $num_tests; $file++) - { - my $uniq_key = $is_spam{$file} . " "; + # This will "compress" the logs so that one log entry can have a + # "count" of n indicating it reprents n similar messages - my(@good_tests) = - grep {length($_) && (! $ignored_rule{$_}) && - (defined($rule_to_index{$_}))} (@{ $tests_hit{$file} }); + foreach my $log (@$logs) { - @good_tests = sort {$a <=> $b} (map {$rule_to_index{$_}} (@good_tests)); + (@goodtests) = grep {exists($_->{index})} (@{$log->{tests_hit}}); + @goodtests = sort {$a <=> $b} map {$_->{index}} @goodtests; - $uniq_key .= join(" ",@good_tests); + if($max_hits_per_msg < scalar(@goodtests)) { + $max_hits_per_msg = scalar(@goodtests); + } - if (exists($count_keys{$uniq_key})) { - $count_keys{$uniq_key}++; + $uniq_key = $log->{isspam} ? "s" : ""; + $uniq_key .= join(" ", @goodtests); + + + # The %count_keys hash's entries will be the log info for each unique log + # $log->{count} is increased to indicate similar logs + + if (exists($uniq_logs{$uniq_key})) { + $uniq_logs{$uniq_key}->{count}++; } else { - $count_keys{$uniq_key} = 1; - $file_key{$file} = $uniq_key; - $uniq_files{$file} = scalar(keys(%count_keys)) - 1; + $uniq_logs{$uniq_key} = $log; + $uniq_logs{$uniq_key}->{count} = 1; } + } - my $num_nondup = scalar(keys(%uniq_files)); + my $num_nondup = scalar(keys %uniq_logs); - open (TOP, ">tmp/tests.h"); - print TOP " + open TOP, ">$opt_o/tests.h"; + print TOP < #include #include @@ -274,7 +260,7 @@ int num_tests = $num_tests; int num_nondup = $num_nondup; int num_spam = $num_spam; -int num_nonspam = $num_nonspam; +int num_nonspam = $num_ham; int max_hits_per_msg = $max_hits_per_msg; unsigned char num_tests_hit[$num_nondup]; unsigned char is_spam[$num_nondup]; @@ -282,196 +268,77 @@ double scores[$num_nondup]; double tmp_total[$num_nondup]; int tests_count[$num_nondup]; +EOF -"; - $_ = join ('', ); - print TOP $_; + + print TOP join('', ); close TOP; - open (DAT, ">tmp/tests.data"); - foreach $file (sort {$a <=> $b} (keys %uniq_files)) { - print DAT ".".$uniq_files{$file}."\n"; + open (DAT, ">$opt_o/tests.data"); - my $out = ''; - $out .= "s".$is_spam{$file}."\n"; + my $out; + my $base_score; + my $num_tests_hit; - my $base_score = 0; - my $num_tests_hit = 0; - foreach my $test (@{$tests_hit{$file}}) { - if ($test eq '') { next; } + $i = 0; + foreach my $log (values %uniq_logs) { + $out = ''; + $base_score = $num_tests_hit = 0; - if ($ignored_rule{$test}) { - warn "ignored rule $test got a hit in $file!\n"; - next; + print DAT "." . $i . "\n"; + + $out .= "s" . ( ($log->{isspam})? 1 : 0 ) . "\n"; + + foreach my $test (@{$log->{tests_hit}}) { + if (!$test->{score}) { + # Don't really know why this happens, but the old logs-to-c + #did it too + + warn "ignored rule " . $test->{name} . " got a hit!"; + next; } - if (!defined $rule_to_index{$test}) { - warn "test with no C index: $test\n"; + if (!$test->{range_lo} && !$test->{range_hi}) { + # We ignored this rule next; } - if ($mutable_tests{$test}) { - $num_tests_hit++; - $out .= "t".$rule_to_index{$test}."\n"; - - if ($num_tests_hit >= $max_hits_per_msg) { - die "Need to increase \$max_hits_per_msg"; + # debugging... + if (!defined $test->{index}) { + warn "test with no index"; } - } else { - $base_score += $scores{$test}; - } - } - $out .= "b" . $base_score . "\n"; # score to add in for non-mutable tests - $out .= "c" . $count_keys{$file_key{$file}} . "\n"; + if ($test->{ismutable}) { + $num_tests_hit++; + $out .= "t".$test->{index}."\n"; - print DAT "n".$num_tests_hit."\n".$out; - } - close DAT; -} + if ($num_tests_hit >= $max_hits_per_msg) { + die "\$max_hits_per_msg not big enough!"; + } -sub read_ranges { - if (!-f 'tmp/ranges.data') { - system ("make tmp/ranges.data"); - } + } + else { + $base_score += $test->{score}; + } - # read ranges, and mutableness, from ranges.data. - open (IN, ") { - /^(\S+) (\S+) (\d+) (\S+)$/ or next; - my $t = $4; - $range_lo{$t} = $1+0; - $range_hi{$t} = $2+0; - my $mut = $3+0; - - if ($allrules{$t}->{issubrule}) { - $ignored_rule{$t} = 1; - $mutable_tests{$t} = 0; - next; } - if (($range_lo{$t} == $range_hi{$t}) && (! $range_lo{$t})) { - #warn "ignored rule: score and range == 0: $t\n"; - $ignored_rule{$t} = 1; - $mutable_tests{$t} = 0; - next; - } - $ignored_rule{$t} = 0; - $index_to_rule[$count] = $t; - $count++; + $out .= "b" . $base_score . "\n"; # score to add for non-mutable tests + $out .= "c" . $log->{count} . "\n"; # number of identical logs - if (!$mut) { - $mutable_tests{$t} = 0; - } elsif ($range_lo{$t} == $range_hi{$t}) { - $mutable_tests{$t} = 0; - } elsif ($allrules{$t}->{tflags} =~ m/\buserconf\b/i) { - $mutable_tests{$t} = 0; - } else { - $mutable_tests{$t} = 1; - } - unless ($mutable_tests{$t} || $scores{$t}) { - $ignored_rule{$t} = 1; - } - } - close IN; + print DAT "n" . $num_tests_hit . "\n" . $out; - # catch up on the ones missed; seems to be userconf or 0-hitters mostly. - foreach my $t (sort keys %allrules) { - next if (exists($range_lo{$t})); - if ($allrules{$t}->{issubrule}) { - $ignored_rule{$t} = 1; - $mutable_tests{$t} = 0; - next; - } - $ignored_rule{$t} = 0; - unless (exists($mutable_tests{$t}) && - ($allrules{$t}->{tflags} !~ m/\buserconf\b/i)) { - $mutable_tests{$t} = 0; - } - unless ($mutable_tests{$t} || $scores{$t}) { - $ignored_rule{$t} = 1; - } - $index_to_rule[$count] = $t; - $count++; + $i++; } - foreach my $t (keys %range_lo) { - next if ($ignored_rule{$t}); - if ($mutable_tests{$t}) { - if (($scores{$t} == 1) && ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) { - $scores{$t} = -1; - } elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/) && - ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) { - $scores{$t} = -0.01; - } - if ($scores{$t} >= $range_hi{$t}) { - $scores{$t} = $range_hi{$t} - 0.001; - } elsif ($scores{$t} <= $range_lo{$t}) { - $scores{$t} = $range_lo{$t} + 0.001; - } - } else { - if ($allrules{$t}->{tflags} =~ m/\buserconf\b/i) { - next; - } elsif ($range_lo{$t} == $range_hi{$t}) { - $scores{$t} = $range_lo{$t}; - next; - } - if (($scores{$t} == 1) && ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) { - $scores{$t} = -1; - } elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/) && - ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) { - $scores{$t} = -0.01; - } - if ($scores{$t} > $range_hi{$t}) { - $scores{$t} = $range_hi{$t}; - } elsif ($scores{$t} < $range_lo{$t}) { - $scores{$t} = $range_lo{$t}; - } - } - } -} -sub evaluate { - my $fprate = ($ga_ny / $num_tests) * 100.0; - my $fnrate = ($ga_yn / $num_tests) * 100.0; + close DAT; - printf ("\n# SUMMARY for threshold %3.1f:\n", $threshold); - printf "# Correctly non-spam: %6d %4.2f%% (%4.2f%% of non-spam corpus)\n", $ga_nn, - ($ga_nn / $num_tests) * 100.0, ($ga_nn / $num_nonspam) * 100.0; - printf "# Correctly spam: %6d %4.2f%% (%4.2f%% of spam corpus)\n" , $ga_yy, - ($ga_yy / $num_tests) * 100.0, ($ga_yy / $num_spam) * 100.0; - printf "# False positives: %6d %4.2f%% (%4.2f%% of nonspam, %6.0f weighted)\n", $ga_ny, - $fprate, ($ga_ny / $num_nonspam) * 100.0, $nyscore*$nybias; - printf "# False negatives: %6d %4.2f%% (%4.2f%% of spam, %6.0f weighted)\n", $ga_yn, - $fnrate, ($ga_yn / $num_spam) * 100.0, $ynscore; - # convert to the TCR metrics used in the published lit - my $nspamspam = $ga_yy; - my $nspamlegit = $ga_yn; - my $nlegitspam = $ga_ny; - my $nlegitlegit = $ga_yn; - my $nlegit = $num_nonspam; - my $nspam = $num_spam; - - my $werr = ($lambda * $nlegitspam + $nspamlegit) - / ($lambda * $nlegit + $nspam); - - my $werr_base = $nspam - / ($lambda * $nlegit + $nspam); - - $werr ||= 0.000001; # avoid / by 0 - my $tcr = $werr_base / $werr; - - my $sr = ($nspamspam / $nspam) * 100.0; - my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0; - printf "# TCR: %3.6f SpamRecall: %3.3f%% SpamPrec: %3.3f%% FP: %3.2f%% FN: %3.2f%%\n", $tcr, $sr, $sp, $fprate, $fnrate; } -__DATA__ +__DATA__ void loadtests (void) { FILE *fin = fopen ("tmp/tests.data", "r"); char buf[256]; @@ -557,4 +424,3 @@ printf ("Read scores for %d tests.\n", num_scores); } - Index: score-ranges-from-freqs =================================================================== --- score-ranges-from-freqs (revision 10703) +++ score-ranges-from-freqs (working copy) @@ -1,251 +0,0 @@ -#!/usr/bin/perl -w -# -# <@LICENSE> -# Copyright 2004 Apache Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# (rough) graphic demo of this algorithm: -# 0.0 = -limit [......] 0 ........ limit -# 0.25 = -limit ..[..... 0 .]...... limit -# 0.5 = -limit ....[... 0 ...].... limit -# 0.75 = -limit ......[. 0 .....].. limit -# 1.0 = -limit ........ 0 [......] limit -my $sliding_window_limits = 4.8; # limits = [-$range, +$range] -my $sliding_window_size = 5.5; # scores have this range within limits - -# 0.0 = -limit [......] 0 ........ limit -# 0.25 = -limit ....[... 0 ]....... limit -# 0.5 = -limit ......[. 0 .]...... limit (note: tighter) -# 0.75 = -limit .......[ 0 ...].... limit -# 1.0 = -limit ........ 0 [......] limit -my $shrinking_window_lower_base = 0.00; -my $shrinking_window_lower_range = 1.00; # *ratio, added to above -my $shrinking_window_size_base = 1.00; -my $shrinking_window_size_range = 1.00; # *ratio, added to above - -my $use_sliding_window = 0; - -my $argcffile = shift @ARGV; -my $scoreset = shift @ARGV; -$scoreset = 0 if ( !defined $scoreset ); - -if (defined ($argcffile) && $argcffile eq '-test') { - # use this to debug the ranking -> score-range mapping: - for $rat (0.0, 0.25, 0.5, 0.75, 1.0) { - my ($lo, $hi); if ($use_sliding_window) { - ($lo, $hi) = sliding_window_ratio_to_range($rat); - } else { - ($lo, $hi) = shrinking_window_ratio_to_range($rat); - } - warn "test: $rat => [ $lo $hi ]\n"; - } exit; -} - -my %freq_spam = (); -my %freq_nonspam = (); - -my $num_spam; -my $num_nonspam; -my $num_total; - -my %mutable_tests = (); -my %ranking = (); -my %soratio = (); -my %is_nice = (); - -if (!defined $argcffile) { $argcffile = "../rules"; } -system ("./parse-rules-for-masses -d \"$argcffile\" -s $scoreset") and die; -if (-e "tmp/rules.pl") { - # Note, the spaces need to stay in front of the require to work around a RPM 4.1 problem - require "./tmp/rules.pl"; -} -else { - die "parse-rules-for-masses had no error but no tmp/rules.pl!?!"; -} - -while (<>) { - /^\s*([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+\S+\s+(.+)\s*$/ or next; - - my $overall = $1+0; - my $spam = $2+0; - my $nonspam = $3+0; - my $soratio = $4+0; - my $ranking = $5+0; - my $test = $6; - - if ($test eq '(all messages)') { - $num_spam = $spam; - $num_nonspam = $nonspam; - $num_total = $spam+$nonspam; - next; - } - next if ($test eq '(all messages as %)'); - - if (!defined ($rules{$test})) { - warn "rule $test no longer exists; ignoring\n"; - next; - } - - $freq{$test} = $overall; - $freq_spam{$test} = $spam; - $freq_nonspam{$test} = $nonspam; - - my $tflags = $rules{$test}->{tflags}; $tflags ||= ''; - if ($tflags =~ /\buserconf\b/ || - ( ($scoreset % 2) == 0 && $tflags =~ /\bnet\b/ )) { - $mutable_tests{$test} = 0; - } else { - $mutable_tests{$test} = 1; - } - if ($tflags =~ m/\bnice\b/i) { - $is_nice{$test} = 1; - } else { - $is_nice{$test} = 0; - } - - if ($overall < 0.01) { # less than 0.01% of messages were hit - $mutable_tests{$test} = 0; - $soratio{$test} = 0.5; - $ranking{$test} = 0.0; - $rules{$test}->{score} = 0; # tvd - disable these rules automagically - - } else { - $soratio{$test} = $soratio; - $ranking{$test} = $ranking; - } -} - -if ( ! mkdir "tmp", 0755 ) { - warn "Couldn't create tmp directory!: $!\n"; -} - -open (OUT, ">tmp/ranges.data"); -foreach my $test (sort { $ranking{$b} <=> $ranking{$a} } keys %freq) { - if (!defined ($rules{$test})) { - warn "no rule $test"; - print OUT ("0 0 0 $test\n"); - next; - } - - my $overall = $freq{$test}; - my $spam = $freq_spam{$test}; - my $nonspam = $freq_nonspam{$test}; - my $soratio = $soratio{$test}; - my $ranking = $ranking{$test}; - my $mutable = $mutable_tests{$test}; - - if (!$mutable || $rules{$test}->{score} == 0) { # didn't look for score 0 - tvd - printf OUT ("%3.3f %3.3f 0 $test\n", - $rules{$test}->{score}, - $rules{$test}->{score}); - next; - } - - # 0.0 = best nice, 1.0 = best nonnice - if ($is_nice{$test}) { - $ranking = .5 - ($ranking / 2); - } else { - $ranking = .5 + ($ranking / 2); - } - - my ($lo, $hi); - if ($use_sliding_window) { - ($lo, $hi) = sliding_window_ratio_to_range($ranking); - } else { - ($lo, $hi) = shrinking_window_ratio_to_range($ranking); - } - - # tvd - my $tflags = $rules{$test}->{tflags}; $tflags ||= ''; - if ( $is_nice{$test} && ( $ranking < .5 ) ) { # proper nice rule - if ( $tflags =~ /\blearn\b/ ) { # learn rules should get a higher score # -5.4 - $lo *=1.8; - } - elsif ($soratio <= 0.05 && $nonspam > 0.5) { # let good rules be larger if they want to, -4.5 - $lo *= 1.5; - } - - $hi = ($soratio == 0) ? $lo : - ($soratio <= 0.005 ) ? $lo/1.1 : - ($soratio <= 0.010 && $nonspam > 0.2) ? $lo/2.0 : - ($soratio <= 0.025 && $nonspam > 1.5) ? $lo/10.0 : - 0; - - if ( $soratio >= 0.35 ) { # auto-disable bad rules - ($lo,$hi) = (0,0); - } - } - elsif ( !$is_nice{$test} && ( $ranking >= .5 ) ) { # proper spam rule - if ( $tflags =~ /\blearn\b/ ) { # learn rules should get a higher score - $hi *=1.8; - } - elsif ( $soratio >= 0.99 && $spam > 1.0 ) { - $hi *= 1.5; # let good rules be larger if they want to - } - - $lo = ($soratio == 1) ? $hi: - ($soratio >= 0.995 ) ? $hi/4.0 : - ($soratio >= 0.990 && $spam > 1.0) ? $hi/8.0 : - ($soratio >= 0.900 && $spam > 10.0) ? $hi/24.0 : - 0; - - if ( $soratio <= 0.65 ) { # auto-disable bad rules - ($lo,$hi) = (0,0); - } - } - else { # rule that has bad nice setting - ($lo,$hi) = (0,0); - } - $mutable = 0 if ( $hi == $lo ); - - printf OUT ("%3.1f %3.1f $mutable $test\n", $lo, $hi); -} -close OUT; -exit; - -sub sliding_window_ratio_to_range { - my $ratio = shift; - my $lo = -$sliding_window_limits + ($sliding_window_size * $ratio); - my $hi = +$sliding_window_limits - ($sliding_window_size * (1-$ratio)); - if ($lo > $hi) { # ??? - ($lo,$hi) = ($hi,$lo); - } - ($lo, $hi); -} - -sub shrinking_window_ratio_to_range { - my $ratio = shift; - my $is_nice = 0; - my $adjusted = ($ratio -.5) * 2; # adj [0,1] to [-1,1] - if ($adjusted < 0) { $is_nice = 1; $adjusted = -$adjusted; } - -#$adjusted /= 1.5 if ( $ratio < 0.95 && $ratio > 0.15 ); # tvd - - my $lower = $shrinking_window_lower_base - + ($shrinking_window_lower_range * $adjusted); - my $range = $shrinking_window_size_base - + ($shrinking_window_size_range * $adjusted); - my $lo = $lower; - my $hi = $lower + $range; - if ($is_nice) { - my $tmp = $hi; $hi = -$lo; $lo = -$tmp; - } - if ($lo > $hi) { # ??? - ($lo,$hi) = ($hi,$lo); - } - - ($lo, $hi); -} - Index: find-extremes =================================================================== --- find-extremes (revision 10703) +++ find-extremes (working copy) @@ -17,38 +17,144 @@ # limitations under the License. # -use Getopt::Std; -getopts("l:L:h"); +use FindBin; +use lib "$FindBin::Bin/../lib"; +use Mail::SpamAssassin::Masses; +use Getopt::Long qw(:config bundling auto_help); +use Pod::Usage; +use strict; +use warnings; + use vars qw { - $opt_h $opt_l $opt_L +$opt_c $opt_s $opt_l $opt_L $opt_inclang }; -sub usage { - die "find-extremes [-l LC] [-L LC] [spam log] [nonspam log] +GetOptions("c|cffile=s@" => \$opt_c, + "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode) + "l|logfile=s" => \$opt_l, + "L|language=s" => \$opt_L, + "include-language=s" => \$opt_inclang); - -l LC also print language specific rules for lang code LC (or 'all') - -L LC only print language specific rules for lang code LC (or 'all') - options -l and -L are mutually exclusive. - if either the spam or and nonspam logs are unspecified, the defaults - are \"spam.log\" and \"nonspam.log\" in the cwd. +my $lower = 1; +#$threshold = 5; +my $higher = 9; +my $min_expected = 2; # Should not be set to more than 5 or less than 2 -"; + +=head1 NAME + +find-extremes - Determine which rules are most likely to cause false positives/negatives. + +=head1 SYNOPSIS + +hit-frequencies [options] + + Options: + -c,--cffile=path Use path as the rules directory + -s,--scoreset=n Use scoreset n + -l,--logfile=file Read in file instead of masses.log + -L,--language=lc Only print language specific tests for specified lang code (try 'all') + --include-language=lc Also print language specific tests for specified lang code (try 'all') + +=head1 DESCRIPTION + +B will read the mass-check log F or the +log given by the B<--logfile> option. By default, B +will assume the proper values for B<--cffile> based on the header of +the masses.log. The output will include the following columns: + +=over 4 + +=item RULE + +=item CHISQUARE + +=item RATIO_FALSEPOS + +=item OVER_FALSEPOS + +=item FREQ_OVER + +=back + +=head1 BUGS + +This script may or may not work as designed - it probably needs some +tweaking, and I probably introduced a bug into it while re-writing for +the new Masses stuff. + +=head1 NOTES + +This script is poorly documented. Patches welcome. + +=cut + + +$opt_s = 0 unless defined $opt_s; + +my $ok_lang = lc ( $opt_inclang || $opt_L || ''); +$ok_lang = '.' if ($ok_lang eq 'all'); + +my $greprules = sub { + my ($name, $rule) = @_; + + return 0 if (($opt_L && !$rule->{lang}) || + ($rule->{lang} && + (!$ok_lang || $rule->{lang} !~ /^$ok_lang/i))); # Wrong language + + return 0 if ($rule->{tflags} =~ /\bnet\b/); + + return 1; + +}; + +$opt_l ||= "masses.log"; + +if (!$opt_c || !scalar(@$opt_c)) { + # Try to read this in from the log, if possible + open (IN, $opt_l) or die "Can't open $opt_l: $!"; + my $files = 0; # are we in the files section? + while() { + if (!$files) { + if (/^\# SVN revision:/) { + $opt_c = [ "$FindBin::Bin/../rules" ]; + last; + } elsif (/^\# Using configuration:$/) { + $files = 1; + } + } elsif (/^\#\s+(.*)\s*$/) { + push (@$opt_c, $1); + } else { + # All done! + last; + } + } + + foreach my $file (@$opt_c) { + die "Can't read $file" unless -r $file; + } } -usage() if($opt_h || ($opt_l && $opt_L)); +my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c, + scoreset => $opt_s, + greprules => $greprules, + logfile => $opt_l, + nologs => 1}); -$lower = 1; -#$threshold = 5; -$higher = 9; -$min_expected = 2; # Should not be set to more than 5 or less than 2 +$masses->readrules(); +$masses->readlogs(); -my %freq_spam = (); # how often non-nice found in spam +my $rules = $masses->get_rules_hash(); +my $logs = $masses->get_logs(); + +my $num_spam = $masses->get_num_spam(); +my $num_ham = $masses->get_num_ham(); + my %freq_over_higher_falsepos = (); # how often non-nice found in ones over # higher threshold that are false positives -my %freq_nonspam = (); # how often nice found in nonspam my %freq_under_lower_falseneg = (); # how often nice found in ones under # lower threshold that are false negatives @@ -59,43 +165,54 @@ my %ratio_expected_falsepos = (); # ratio version of above my %ratio_expected_falseneg = (); # ditto -my $num_spam = 0; -my $num_nonspam = 0; my $num_over_higher_falsepos = 0; my $num_under_lower_falseneg = 0; -my $ok_lang = ''; -readscores(); +my %chisquare = ( ); +my %prob = ( ); -$ok_lang = lc ($opt_l || $opt_L || ''); -if ($ok_lang eq 'all') { $ok_lang = '.'; } -foreach my $key (keys %rules) { +foreach my $key (keys %$rules) { - if ( ($opt_L && !$rules{$key}->{lang}) || - ($rules{$key}->{lang} && - (!$ok_lang || $rules{$key}->{lang} !~ /^$ok_lang/i) - ) ) { - delete $rules{$key} ; next; - } - - if ($rules{$key}->{tflags} =~ m/net/) { - delete $rules{$key}; - next; - } - if ($rules{$key}->{tflags} !~ m/userconf/) { - if ($rules{$key}->{tflags} =~ m/nice/) { - $freq_nonspam{$key} = 0; + if ($rules->{$key}->{tflags} !~ /\buserconf\b/) { + if ($rules->{$key}->{tflags} =~ m/nice/) { $freq_under_lower_falseneg{$key} = 0; } else { - $freq_spam{$key} = 0; $freq_over_higher_falsepos{$key} = 0; } } + } -readlogs(); +foreach my $log (@$logs) { + if($log->{isspam}) { + # Also need to count plus_hits + my $plus_hits = 0; + foreach my $test (@{$log->{tests_hit}}) { + $plus_hits += $test->{score} if ($test->{score} > 0); + } + + if(($log->{score} <= $lower) && $plus_hits && $plus_hits >= $lower) { + $num_under_lower_falseneg++; + foreach my $test (@{$log->{tests_hit}}) { + $num_under_lower_falseneg++; + $freq_under_lower_falseneg{$test->{name}}++ if exists $freq_under_lower_falseneg{$test->{name}}; + } + } + } + else { + if($log->{score} > $higher) { + $num_over_higher_falsepos++; + foreach my $test (@{$log->{tests_hit}}) { + $num_over_higher_falsepos++; + $freq_over_higher_falsepos{$test->{name}}++ if exists $freq_over_higher_falsepos{$test->{name}}; + } + } + } + +} + unless (($num_over_higher_falsepos >= $min_expected) && ($num_under_lower_falseneg >= $min_expected)) { die "Insufficient extremes in dataset (" . $num_over_higher_falsepos . @@ -119,12 +236,13 @@ } my $ratio_falsepos = $num_over_higher_falsepos/$num_spam; -my $ratio_falseneg = $num_under_lower_falseneg/$num_nonspam; +my $ratio_falseneg = $num_under_lower_falseneg/$num_ham; my $skipped_non_nice = 0; -foreach $rule (keys %freq_spam) { - my $expected = $freq_spam{$rule}*$ratio_falsepos; +# non-nice rules +foreach my $rule (keys %freq_over_higher_falsepos) { + my $expected = $rules->{$rule}->{freq_spam}*$ratio_falsepos; if ($expected <= $min_expected) { $skipped_non_nice++; next; @@ -136,7 +254,7 @@ $freq_over_higher_falsepos{$rule}/$expected; ($chisquare{$rule},$prob{$rule}) = chisquare($num_spam,$num_over_higher_falsepos, - $freq_spam{$rule},$freq_over_higher_falsepos{$rule}); + $rules->{$rule}->{freq_spam},$freq_over_higher_falsepos{$rule}); if ($freq_over_higher_falsepos{$rule} < $expected) { $chisquare{$rule} *= -1; } @@ -146,8 +264,9 @@ my $skipped_nice = 0; -foreach $rule (keys %freq_nonspam) { - my $expected = $freq_nonspam{$rule}*$ratio_falseneg; +# nice rules +foreach my $rule (keys %freq_under_lower_falseneg) { + my $expected = $rules->{$rule}->{freq_ham}*$ratio_falseneg; if ($expected <= $min_expected) { $skipped_nice++; next; @@ -158,8 +277,8 @@ $ratio_expected_falseneg{$rule} = $freq_under_lower_falseneg{$rule}/$expected; ($chisquare{$rule},$prob{$rule}) = - chisquare($num_nonspam,$num_under_lower_falseneg, - $freq_nonspam{$rule},$freq_under_lower_falseneg{$rule}); + chisquare($num_ham,$num_under_lower_falseneg, + $rules->{$rule}->{freq_ham},$freq_under_lower_falseneg{$rule}); if ($freq_under_lower_falseneg{$rule} < $expected) { $chisquare{$rule} *= -1; } @@ -167,8 +286,12 @@ warn "Skipped nice: $skipped_nice\n"; -@rules_falsepos = grep {$prob{$_} < .5} (keys %over_expected_falsepos); +# The rest is copied verbatim from before - its complicated and not +# commented and should work unchanged except for the freq_spam and +# freq_ham stuff and fixing some use strict stuff +my @rules_falsepos = grep {$prob{$_} < .5} (keys %over_expected_falsepos); + if (scalar(@rules_falsepos)) { print "RULE\t\tCHISQUARE\tRATIO_FALSEPOS\tOVER_FALSEPOS\tFREQ_OVER ($num_over_higher_falsepos)\n"; my(@rules_falsepos_bad) = @@ -183,7 +306,7 @@ $over_expected_falsepos{$a}) || ($freq_over_higher_falsepos{$b} <=> $freq_over_higher_falsepos{$a})} (@rules_falsepos_bad); - foreach $rule (@rules_falsepos_bad) { + foreach my $rule (@rules_falsepos_bad) { print $rule . "\t" . $prob{$rule} . "\t" . $ratio_expected_falsepos{$rule} . "\t" . $over_expected_falsepos{$rule} . "\t" . @@ -199,9 +322,9 @@ ($chisquare{$a} <=> $chisquare{$b}) || ($ratio_expected_falsepos{$a} <=> $ratio_expected_falsepos{$b}) || - ($freq_spam{$b} <=> - $freq_spam{$a})} (@rules_falsepos_good); - foreach $rule (@rules_falsepos_good) { + ($rules->{$b}->{freq_spam} <=> + $rules->{$a}->{freq_spam})} (@rules_falsepos_good); + foreach my $rule (@rules_falsepos_good) { print $rule . "\t" . $prob{$rule} . "\t" . $ratio_expected_falsepos{$rule} . "\t" . $over_expected_falsepos{$rule} . "\t" . @@ -212,7 +335,7 @@ warn "No over-falsepos to print\n"; } -@rules_falseneg = grep {$prob{$_} < .5} (keys %over_expected_falseneg); +my @rules_falseneg = grep {$prob{$_} < .5} (keys %over_expected_falseneg); if (scalar(@rules_falseneg)) { print "RULE\t\tCHISQUARE\tRATIO_FALSENEG\tOVER_FALSENEG\tFREQ_UNDER ($num_under_lower_falseneg)\n"; @@ -228,7 +351,7 @@ $over_expected_falseneg{$a}) || ($freq_under_lower_falseneg{$b} <=> $freq_under_lower_falseneg{$a})} (@rules_falseneg_bad); - foreach $rule (@rules_falseneg_bad) { + foreach my $rule (@rules_falseneg_bad) { print $rule . "\t" . $prob{$rule} . "\t" . $ratio_expected_falseneg{$rule} . "\t" . $over_expected_falseneg{$rule} . "\t" . @@ -244,9 +367,9 @@ ($chisquare{$a} <=> $chisquare{$b}) || ($ratio_expected_falseneg{$a} <=> $ratio_expected_falseneg{$b}) || - ($freq_spam{$b} <=> - $freq_spam{$a})} (@rules_falseneg_good); - foreach $rule (@rules_falseneg_good) { + ($rules->{$b}->{freq_ham} <=> + $rules->{$a}->{freq_ham})} (@rules_falseneg_good); + foreach my $rule (@rules_falseneg_good) { print $rule . "\t" . $prob{$rule} . "\t" . $ratio_expected_falseneg{$rule} . "\t" . $over_expected_falseneg{$rule} . "\t" . @@ -258,97 +381,3 @@ } exit; - -sub readlogs { - my $spam = $ARGV[0] || "spam.log"; - my $nonspam = $ARGV[1] || (-f "good.log" ? "good.log" : "nonspam.log"); - - - (open(NONSPAM,$nonspam)) || - (die "Couldn't open file '$nonspam': $!; stopped"); - - while (defined($line = )) { - if ($line =~ m/^\s*\#/) { - next; - } elsif ($line =~ m/^.\s+-?\d+\s+\S+\s*(\S*)/) { - my $tests = $1; - my $hits = 0; - my(@tests) = (); - foreach $test (grep {length($_)} (split(/,+/,$tests))) { - if (exists($rules{$test})) { - push @tests, $test; - $hits += $rules{$test}->{score}; - } - } - - if (scalar(@tests)) { - $num_nonspam++; - foreach $test (grep {exists($freq_nonspam{$_})} (@tests)) { - $freq_nonspam{$test}++; - } - if ($hits >= $higher) { - $num_over_higher_falsepos++; - foreach $test (grep - {exists($freq_over_higher_falsepos{$_})} (@tests)) { - $freq_over_higher_falsepos{$test}++; - } - } - } - } elsif ($line =~ m/\S/) { - chomp($line); - warn "Can't interpret line '$line'; skipping"; - } - } - - close(NONSPAM); - - (open(SPAM,$spam)) || (die "Couldn't open file '$spam': $!; stopped"); - - while (defined($line = )) { - if ($line =~ m/^\s*\#/) { - next; - } elsif ($line =~ m/^.\s+-?\d+\s+\S+\s*(\S*)/) { - my $tests = $1; - my $hits = 0; - my $plus_hits = 0; - my(@tests) = (); - foreach $test (grep {length($_)} (split(/,+/,$tests))) { - if (exists($rules{$test})) { - push @tests, $test; - $hits += $rules{$test}->{score}; - if ($rules{$test}->{score} > 0) { - $plus_hits += $rules{$test}->{score}; - } - } - } - - if (scalar(@tests)) { - $num_spam++; - foreach $test (grep {exists($freq_spam{$_})} (@tests)) { - $freq_spam{$test}++; - } - if (($hits <= $lower) && $plus_hits && - ($plus_hits >= $lower)) { - $num_under_lower_falseneg++; - foreach $test (grep - {exists($freq_under_lower_falseneg{$_})} (@tests)) { - $freq_under_lower_falseneg{$test}++; - } - } - } - } elsif ($line =~ m/\S/) { - chomp($line); - warn "Can't interpret line '$line'; skipping"; - } - } - - close(SPAM); -} - - -sub readscores { - system ("./parse-rules-for-masses") and - die "Couldn't do parse-rules-for-masses: $?; stopped"; - require "./tmp/rules.pl"; -} -