Index: lib/Mail/SpamAssassin.pm =================================================================== --- lib/Mail/SpamAssassin.pm (revision 20231) +++ lib/Mail/SpamAssassin.pm (working copy) @@ -1263,11 +1263,13 @@ # read a file called "init.pre" in site rules dir *before* all others; # even the system config. + + # Save this in $self so that it can be accessed externally (for logging, etc.) + $self->{site_rules_filename} ||= $self->first_existing_path (@site_rules_path); my $siterules = $self->{site_rules_filename}; - $siterules ||= $self->first_existing_path (@site_rules_path); + $self->{rules_filename} ||= $self->first_existing_path (@default_rules_path); my $sysrules = $self->{rules_filename}; - $sysrules ||= $self->first_existing_path (@default_rules_path); if ($siterules) { $fname = File::Spec->catfile ($siterules, "init.pre"); @@ -1300,8 +1302,8 @@ $self->get_and_create_userstate_dir(); # user prefs file + $self->{userprefs_filename} ||= $self->first_existing_path (@default_userprefs_path); $fname = $self->{userprefs_filename}; - $fname ||= $self->first_existing_path (@default_userprefs_path); if (defined $fname) { if (!-f $fname && !$self->{dont_copy_prefs} && !$self->create_default_prefs($fname)) { Index: lib/Mail/SpamAssassin/Masses.pm =================================================================== --- lib/Mail/SpamAssassin/Masses.pm (revision 0) +++ lib/Mail/SpamAssassin/Masses.pm (revision 0) @@ -0,0 +1,788 @@ +# <@LICENSE> +# Copyright 2004 Apache Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +=head1 NAME + +Mail::SpamAssassin::Masses - Interface for reading and parsing rules +and mass-check logs for SpamAssassin + +=head1 SYNOPSIS + + my $parser = Mail::SpamAssassin::Masses->new(); + my $rules = $parser->readrules(); + my $logs = $parser->readlogs(); + + foreach my $test (keys %$rules) { + if ($rules->{$test}->{score} > 1) { + ... + } + +=head1 DESCRIPTION + +Mail::SpamAssassin::Masses is a module to simplify the many scripts +that used to make up the SpamAssassin re-scoring process. By +consolidating all the shared code in one module, the scripts can be +simplified and require fewer temporary files. + +=head1 METHODS + +=over 4 + +=cut + +package Mail::SpamAssassin::Masses; + +use strict; +use warnings; +use Carp; + +=item $parser = Mail::SpamAssassin::Masses->new( [ { opt => val, ... } ] ); + +Construct a new Mail::SpamAssassin::Masses object. You may pass the +following attribute-value pairs to the constructor. + +=over 4 + +=item rulesdir + +The directory containing rules. If multiple directories are desired, +an anonymous array should be passed. + +=item scoreset + +Scoreset to deal with. + +=item logfile + +Filename of mass-check log. + +=item falses + +Also count frequencies for false positives and false negatives from +the logs. + +=item falsesonly + +Only count false positives and false negatives. + +=item greprule + +Coderef that is passed a rule name and a hash ref with the entries +containing info about the rule. If the sub returns false, it is skipped. + +=item greplog + +Coderef that is passed a raw log entry. If it returns false, the entry +is skipped. + +=item sliding_window + +Use a sliding window for score ranges rather than a shrinking window. + +=item nologs + +Save memory by not saving the individual log results, just the +aggregate totals + +=back + +=cut + +sub new { + + my $class = shift; + $class = ref($class) || $class; + + my $self = shift; + if (!defined $self){ + $self = { }; + } + + $self->{scoreset} ||= 0; + $self->{rulesdir} ||= ''; + $self->{logfile} ||= "masses.log"; + + bless($self, $class); + + return $self; + +} + +=item $parser->readrules() + +Read and parse the rules from the directory specified as +C. This loads the following keys and values into the hash +entry representing the rules (see below). + +=over 4 + +=item name + +Contains the rule's name. + +=item score + +Contains the rule's score. + +=item type + +Contains the rule's type (header, body, uri, etc.) + +=item tflags + +Contains the rules tflags (nice, autolearn, etc.) as specified in the config file. + +=item lang + +Set to the value of C for language-specific tests. + +=item issubrule + +Set to true if the rules is a sub-rule, (i.e. it starts with +__). Otherwise, undefined. + +=item isnice + +This key exists and is true if the rule is nice (i.e. with a score +that can be below zero). + +=item describe + +Set to the rule's description, in English, or in the rule's language. + +=back + +There may be more values once C is run. + +=cut + + +sub readrules { + + my $self = shift; + + $self->{rules} ||= { }; + my $rules = $self->{rules}; # $rules is a reference to the anon hash + + my @dirs = ref($self->{rulesdir}) ? @{$self->{rulesdir}} : $self->{rulesdir}; + + my @files; + + foreach my $indir (@dirs) { + if (-d $indir) { + @files = glob("$indir/*.cf"); # no reason to only do numbered files + } else { + @files = ( $indir ); + } + + foreach my $file (@files) { + open (IN, "<$file") || croak("Can't open $file, $!"); + while() { + s/#.*$//g; + s/^\s+//; + s/\s+$//; + next if /^$/; + + my $lang = ''; + if (s/^lang\s+(\S+)\s+//) { + $lang = lc $1; + } + + if (/^(header|rawbody|body|full|uri|meta)\s+(\S+)\s+/) { + my $type = $1; + my $name = $2; + + $rules->{$name} ||= { }; + $rules->{$name}->{name} = $name; + $rules->{$name}->{type} = $type; + $rules->{$name}->{lang} = $lang if $lang; + $rules->{$name}->{tflags} = ''; + + if ($name =~ /^__/) { + $rules->{$name}->{issubrule} = '1'; + } + + } elsif (/^describe\s+(\S+)\s+(.+)$/) { + + # Let's get description in english, por favor -- unless the rule isn't english + + next if ($lang && (!$rules->{$1}->{lang} || $rules->{$1}->{lang} ne $lang)); + + $rules->{$1} ||= { }; + $rules->{$1}->{describe} = $2; + + } elsif (/^tflags\s+(\S+)\s+(.+)$/) { + my $name = $1; + $rules->{$name} ||= { }; + $rules->{$name}->{tflags} = $2; + if ($2 =~ /nice/) { + $rules->{$name}->{isnice} = 1; + } + } elsif (/^score\s+(\S+)\s+(.+)$/) { + my($name,$score) = ($1,$2); + $rules->{$name} ||= { }; + if ( $score =~ /\s/ ) { # there are multiple scores + ($score) = (split(/\s+/,$score))[$self->{scoreset}]; + } + $rules->{$name}->{score} = $score; + } + } + close IN; + } + } + foreach my $rule (keys %{$rules}) { + if (!defined $rules->{$rule}->{type}) { + delete $rules->{$rule}; # no rule definition -> no rule + next; + } + + if (!defined $rules->{$rule}->{score}) { + my $def = 1.0; + if ($rule =~ /^T_/) { $def = 0.01; } + + if ($rules->{$rule}->{isnice}) { + $rules->{$rule}->{score} = -$def; + } else { + $rules->{$rule}->{score} = $def; + } + } + + if ($self->{greprules} && !&{$self->{greprules}}($rule, $rules->{$rule})) + { + delete $rules->{$rule}; + next; + } + + } + + $self->{_readrules} = 1; +} + +=item $parser->readlogs() + +Read and parse logs from C. This will create the anonymous +array of hashes referred to by C<$parser->{logs}>, with the following +keys: + +=over 4 + +=item isspam + +True if the message is spam. False or undefined otherwise. + +=item isfalse + +True if the message was a false negative or positive. + +=item tests_hit + +Array reference containing references to the hash representing each +rule hit. + +=item score + +Score the message received (under current scores). + +=back + +In addition, this method adds the following keys to the rule +information in C<$parser->{rules}>. + +=over 4 + +=item freq_spam + +Frequency hit in spam. + +=item freq_ham + +Frequency hit in ham. + +=item freq_fp + +Frequency in false positives. + +=item freq_fn + +Frequency in false negatives. + +=back + +Also, sets C<$parser->{num_spam}> and C<$parser->{num_ham}> to the number of +spam logs read and the number of ham logs read, respectively. + +=cut + +sub readlogs { + + my $self = shift; + + if (!$self->{_readrules}) { + # need to read scores first! + $self->readrules(); + } + + my $rules = $self->{rules}; # copy the ref, shorthand + + my $logs; + if (! $self->{nologs}) { + $self->{logs} ||= [ ]; + $logs = $self->{logs}; + } + + + my ($num_spam, $num_ham, $count, $num_fp, $num_fn); + $num_spam = $num_ham = $count = $num_fp = $num_fn = 0; + + # First, initialize stuff + foreach my $rule (values %{$self->{rules}}) { + $rule->{freq_spam} ||= 0; + $rule->{freq_ham} ||= 0; + + if($self->{falses}) { + $rule->{freq_fp} ||= 0; + $rule->{freq_fn} ||= 0; + } + + } + + my $file = $self->{logfile}; + open (IN, "<$file"); + + while () { + next if /^\#/; + next if /^$/; + if($_ !~ /^(.)\s+(.)\s+-?[\d.]+\s+\S+(\s+\S+\s+)/) { warn "bad line: $_"; next; } + + if ($self->{greplogs} && !&{$self->{greplogs}}($_)) { + next; + } + + my $manual = $1; + my $result = $2; + $_ = $3; + s/(?:bayes|time)=\S+//; + s/,,+/,/g; + s/^\s+//; + s/\s+$//; + + + if ($manual ne $result) { + $self->{isfalse} = 1; + } + elsif ($self->{falsesonly}) { + next; + } + + if ($manual eq "s") { + $num_spam++; + $logs->[$count]->{isspam} = 1 unless $self->{nologs}; + $num_fn++ if $result eq "h"; + } else { + $num_ham++; + $num_fp++ if $result eq "s"; + } + + my @tests = (); + my $score = 0; + foreach my $tst (split (/,/, $_)) { + next if ($tst eq ''); + + # Don't count non-existant rules + # (Could happen with greprules) + next if ( !$rules->{$tst} || !$rules->{$tst}->{type} ); + + if ($manual eq "s") { + $rules->{$tst}->{freq_spam}++; + $rules->{$tst}->{freq_fn}++ if ($self->{falses} && $result eq "h"); + } + else { + $rules->{$tst}->{freq_ham}++; + $rules->{$tst}->{freq_fp}++ if ($self->{falses} && $result eq "s"); + } + + $score += $rules->{$tst}->{score}; + + push (@tests, $rules->{$tst}) unless $self->{nologs}; + } + + $logs->[$count]->{tests_hit} = \@tests unless $self->{nologs}; + $logs->[$count]->{score} = $score; + + $count++; + } + close IN; + + $self->{num_spam} = $num_spam; + $self->{num_ham} = $num_ham; + if ($self->{falses}) { + $self->{num_fn} = $num_fn; + $self->{num_fp} = $num_fp; + } + + $self->{_readlogs} = 1; # Done reading logs + +} + +=item $parser->do_statistics(); + +Calculate the S/O ratio and the rank for each test. + +This adds the following keys to the rules hashes. + +=over 4 + +=item spam_percent + +Percentage of spam messages hit. + +=item ham_percent + +Percentage of ham messages hit. + +=item soratio + +S/O ratio -- percentage of spam messages hit divided by total +percentage of messages hit. + +=back + +=cut + +sub do_statistics { + my $self = shift; + + if (! $self->{_readlogs} ) { + $self->readlogs(); + } + + my $rank_hi=0; + my $rank_lo=999999; + + foreach my $rule (values %{$self->{rules}}) { + + if (!$rule->{freq_spam}) { + $rule->{spam_percent} = 0; + } else { + $rule->{spam_percent} = $rule->{freq_spam} / $self->{num_spam} * 100.0; + } + + if (!$rule->{freq_ham}) { + $rule->{ham_percent} = 0; + } else { + $rule->{ham_percent} = $rule->{freq_ham} / $self->{num_ham} * 100.0; + } + + if (!$rule->{freq_spam} && !$rule->{freq_ham}) { + $rule->{soratio} = 0.5; + next; + } + + $rule->{soratio} = $rule->{spam_percent} / ($rule->{spam_percent} + $rule->{ham_percent}); + + } + + $self->{_statistics} = 1; + +} + +=item $parser->do_rank(); + +Calculates the ranking for each rule and stores this in the +appropriate key. + +=over 4 + +=item rank + +"Rank" of the rule. High numbers are good, low are bad. + +=back + +=cut + +sub do_rank { + + my $self = shift; + + if (! $self->{_statistics} ) { + $self->do_statistics(); + } + + my $rank_hi = 0; + my $rank_lo = 9999999; + + my %unwanted; + my %wanted; + my %wranks = (); + my %uranks = (); + my $rules = $self->{rules}; + + + foreach my $rule (values %{$self->{rules}}) { + + $wanted{$rule->{name}} = $rule->{isnice} ? $rule->{freq_ham} : $rule->{freq_spam}; + $unwanted{$rule->{name}} = $rule->{isnice} ? $rule->{freq_spam} : $rule->{freq_ham}; + + $wranks{$wanted{$rule->{name}}} = 1; + $uranks{$unwanted{$rule->{name}}} = 1; + + } + + my @wanted = sort { $wanted{$a} <=> $wanted{$b} } keys %wanted; + my @unwanted = sort { $unwanted{$b} <=> $unwanted{$a} } keys %unwanted; + + # first half of ranking is the wanted rank + my $position = 0; + my $last = undef; + + foreach my $test (@wanted) { + $position++ if defined $last && $last != $wanted{$test}; + $rules->{$test}->{rank} += $position; + $last = $wanted{$test}; + } + + # second half is the unwanted rank + $position = 0; + $last = undef; + + # Avoid divide by 0 errors! + die "Error: no rules read" if (!(scalar keys %uranks)); + + my $normalize = (scalar keys %wranks) / (scalar keys %uranks); + + foreach my $test (@unwanted) { + $position++ if defined $last && $last != $unwanted{$test}; + $rules->{$test}->{rank} += ($position * $normalize); + $last = $unwanted{$test}; + $rank_hi = $rules->{$test}->{rank} if ($rules->{$test}->{rank} > $rank_hi); + $rank_lo = $rules->{$test}->{rank} if ($rules->{$test}->{rank} < $rank_lo); + } + + $rank_hi = $rank_hi - $rank_lo; + foreach my $rule (values %{$rules}) { + $rule->{rank} = ($rank_hi == 0) ? 0.001 : (($rule->{rank} - $rank_lo)/ $rank_hi); + } + + $self->{_rank} = 1; +} + +=item $parser->get_rules_array(); + +Returns a reference to an array of hash references. The values of +these hash have keys as listed above. + +=cut + +sub get_rules_array { + my $self = shift; + return [ values %{$self->{rules}} ]; +} + +=item $parser->get_rules_hash(); + +Returns a reference to a hash with rule names as keys and hash +references as values. The values of these hash have keys as listed +above. + +=cut + +sub get_rules_hash { + my $self = shift; + return $self->{rules}; +} + +=item $parser->get_logs(); + +Returns a reference to the array containing log entries, in the form +of anonymous hashes with keys as described above. + +=cut + +sub get_logs { + my $self = shift; + return $self->{logs}; +} + +=item $parser->get_num_ham(); + +Returns number of ham logs read. + +=cut + +sub get_num_ham { + my $self = shift; + return $self->{num_ham}; +} + +=item $parser->get_num_spam(); + +Returns number of spam logs read. + +=cut + +sub get_num_spam { + my $self = shift; + return $self->{num_spam}; +} + +=item $parser->do_score_ranges(); + +Figure out range in which score can be set based on the soratio, etc. + +This is necessary so that the perceptron doesn't set silly +scores. (This may not be as much of a problem as it was with the old +GA.) + +This adds the following keys to the rules hashes: + +=over 4 + +=item ismutable + +Determines whether the perceptron can select a score for this test. + +=item range_lo + +Determines the lowest score the perceptron can set. + +=item range_hi + +Determines the highest score the perceptron can set. + +=cut + +sub do_score_ranges() { + + my $self = shift; + + if ( !$self->{_statistics} ) { + $self->do_statistics(); + } + if ( !$self->{_rank} ) { + $self->do_rank(); + } + + foreach my $rule (values %{$self->{rules}}) { + + my ($rank, $lo, $hi); + + $rank = $rule->{rank}; + + # Get rid of rules that don't hit -- and disable completely. + if ($rule->{spam_percent} + $rule->{ham_percent} < 0.01 || + $rule->{score} == 0) { + + $rule->{ismutable} = 0; + $rule->{range_lo} = $rule->{range_hi} = 0; + next; + + } + + # next: get rid of tests that don't apply in this scoreset + # or are userconf -- set ismutable to 0, but keep the score + if ($rule->{tflags} =~ /\buserconf\b/ || + (($self->{scoreset} % 2) == 0 && $rule->{tflags} =~/\bnet\b/)) { + + $rule->{ismutable} = 0; + $rule->{range_lo} = $rule->{range_hi} = $rule->{score}; + next; + + } + + + # Normal rules: + + # This seems to convert from [-1,1] to [0,1] but we're already in + # [0,1] space - Is this right? + + # The current way ranks are calculated, > 0.5 and < 0.5 have no + # special meaning + +# # 0.0 = best nice, 1.0 = best nonnice +# if ($rule->{isnice}) { +# $rank = .5 - ($rank / 2); +# } else { +# $rank = .5 + ($rank / 2); +# } + + # using this seems to work better + + if($rule->{isnice}) { + $hi = 0; + $lo = $rule->{rank} * -4.5; + } else { + $hi = $rule->{rank} * 4.5; + $lo = 0 + } + + # Modify good rules to be lower + if ($rule->{isnice}) { + if ($rule->{tflags} =~ /\blearn\b/) { # learn rules should get + # higher scores (-5.4) + $lo *= 1.8; + } + elsif ( $rule->{soratio} <= 0.05 && $rule->{ham_percent} > 0.5) { + $lo *= 1.5; + } + + # argh, ugly... but i'm copying it whole... + $hi = ($rule->{soratio} == 0) ? $lo : + ($rule->{soratio} <= 0.005 ) ? $lo/1.1 : + ($rule->{soratio} <= 0.010 && $rule->{ham_percent} > 0.2) ? $lo/2.0 : + ($rule->{soratio} <= 0.025 && $rule->{ham_percent} > 1.5) ? $lo/10.0 : + 0; + + if ($rule->{soratio} >= 0.35 ) { + ($lo, $hi) = (0,0); + } + } + else { # Make non-nice rules have higher scores if they're good + if ($rule->{tflags} =~ /\blearn\b/ ) { + $hi *= 1.8; + } + elsif ( $rule->{soratio} >= 0.99 && $rule->{spam_percent} > 1.0) { + $hi *= 1.5; + } + + $lo = ($rule->{soratio} == 1) ? $hi: + ($rule->{soratio} >= 0.995 ) ? $hi/4.0 : + ($rule->{soratio} >= 0.990 && $rule->{spam_percent} > 1.0) ? $hi/8.0 : + ($rule->{soratio} >= 0.900 && $rule->{spam_percent} > 10.0) ? $hi/24.0 : + 0; + + if ($rule->{soratio} <= 0.65 ) { # auto-disable bad rules + ($lo, $hi) = (0,0); + } + } + + + # Some sanity checking + if($hi < $lo) { + ($lo, $hi) = ($hi, $lo); + } + + + $rule->{ismutable} = ($lo == $hi) ? 0 : 1; + $rule->{range_lo} = $lo; + $rule->{range_hi} = $hi; + + } +} + + +# Pacify perl +1; Property changes on: masses ___________________________________________________________________ Name: svn:ignore - craig-evolve.scores craig-evolve.scores.all craig-evolve.scores.neg craig-evolve.scores.pos falseneg.log falsepos.log trueneg.log truepos.log spam*.log ham*.log nonspam*.log newscores analysis spam.sonic nonspam.sonic spam.local nonspam.local commands.sh scores.h tests.h tmp old-random-search result.* results.* overnight.* galib245 pgapack evolve.scores evolve craig-evolve goodresults freqs logs spam.dogma nonspam.dogma spam.alltime PI* nonspam.jm-traps nonspam.log* spam.jm-traps spam.log* dprof.* tmon.* spamassassin spamassassin.prefs jm-mass-check-rules + craig-evolve.scores craig-evolve.scores.all craig-evolve.scores.neg craig-evolve.scores.pos falseneg.log falsepos.log trueneg.log truepos.log spam*.log ham*.log nonspam*.log newscores analysis spam.sonic nonspam.sonic spam.local nonspam.local commands.sh scores.h tests.h tmp old-random-search result.* results.* overnight.* galib245 pgapack evolve.scores evolve craig-evolve goodresults freqs logs spam.dogma nonspam.dogma spam.alltime PI* nonspam.jm-traps nonspam.log* spam.jm-traps spam.log* dprof.* tmon.* spamassassin spamassassin.prefs jm-mass-check-rules masses.log* masses*.log tenpass_results perceptron.scores Index: masses/parse-rules-for-masses =================================================================== --- masses/parse-rules-for-masses (revision 20231) +++ masses/parse-rules-for-masses (working copy) @@ -1,148 +0,0 @@ -#!/usr/bin/perl -# -# <@LICENSE> -# Copyright 2004 Apache Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -sub usage { - die " -parse-rules-for-masses: parse the SpamAssassin rules files for mass-checks, - evolving, and frequency analysis - -usage: ./parse-rules-for-masses [-d rulesdir] [-o outputfile] [-s scoreset] - -rulesdir defaults to ../rules -outputfile defaults to ./tmp/rules.pl -scoreset default to 0 - -"; -} - -use Getopt::Long; -use Data::Dumper; - -use vars qw(@rulesdirs $outputfile $scoreset); -GetOptions ( - "d=s" => \@rulesdirs, - "o=s" => \$outputfile, - "s=i" => \$scoreset, - "help|h|?" => sub { usage(); } ); - -if ($#rulesdirs < 0) { - @rulesdirs = ("../rules"); -} - -if (!defined $outputfile) { - $outputfile = "./tmp/rules.pl"; - mkdir ("tmp", 0755); -} - -$scoreset = 0 if ( !defined $scoreset ); - -my $rules = { }; -readrules(@rulesdirs); - -my $scores = { }; -foreach my $key (keys %{$rules}) { - $scores->{$key} = $rules->{$key}->{score}; -} - -writerules($outputfile); -exit; - -sub readrules { - foreach my $indir (@_) { - my @files = <$indir/[0-9]*.cf>; - my $file; - %rulesfound = (); - %langs = (); - foreach $file (sort @files) { - open (IN, "<$file"); - while () { - s/#.*$//g; s/^\s+//; s/\s+$//; next if /^$/; - - my $lang = ''; - if (s/^lang\s+(\S+)\s+//) { - $lang = $1; - } - - if (/^(header|rawbody|body|full|uri|meta)\s+(\S+)\s+/) { - my $type = $1; - my $name = $2; - - my $issubrule = '0'; - if ($name =~ /^__/) { $issubrule = '1'; } - - $rules->{$1} ||= { }; - $rules->{$name}->{type} = $type; - $rules->{$name}->{lang} = $lang; - $rules->{$name}->{issubrule} = $issubrule; - $rules->{$name}->{tflags} = ''; - - } elsif (/^describe\s+(\S+)\s+(.+)$/) { - $rules->{$1} ||= { }; - $rules->{$1}->{describe} = $2; - - } elsif (/^tflags\s+(\S+)\s+(.+)$/) { - $rules->{$1} ||= { }; - $rules->{$1}->{tflags} = $2; - - } elsif (/^score\s+(\S+)\s+(.+)$/) { - my($name,$score) = ($1,$2); - $rules->{$name} ||= { }; - if ( $score =~ /\s/ ) { # there are multiple scores - ($score) = (split(/\s+/,$score))[$scoreset]; - } - $rules->{$name}->{score} = $score; - } - } - close IN; - } - } - - foreach my $rule (keys %{$rules}) { - if (!defined $rules->{$rule}->{type}) { - delete $rules->{$rule}; # no rule definition -> no rule - next; - } - - if (!defined $rules->{$rule}->{score}) { - my $def = 1.0; - if ($rule =~ /^T_/) { $def = 0.01; } - - if ($rules->{$rule}->{tflags} =~ /nice/) { - $rules->{$rule}->{score} = -$def; - } else { - $rules->{$rule}->{score} = $def; - } - } - } -} - -sub writerules { - my $outfile = shift; - # quick hack to create the tmp directory - system ("mkdir -p $outfile 2>/dev/null ; rmdir $outfile 2>/dev/null"); - - open (OUT, ">$outfile") or die "cannot write to $outfile"; - print OUT "# dumped at ".`date`."\n"; - - $Data::Dumper::Purity = 1; - print OUT Data::Dumper->Dump ([$rules, $scores], ['*rules', '*scores']); - - print OUT "1;"; - close OUT; -} - Index: masses/hit-frequencies =================================================================== --- masses/hit-frequencies (revision 20231) +++ masses/hit-frequencies (working copy) @@ -16,385 +16,256 @@ # limitations under the License. # + use FindBin; -use Getopt::Std; -getopts("fm:M:X:l:L:pxhc:at:s:i"); +use lib "$FindBin::Bin/../lib"; +use Mail::SpamAssassin::Masses; +use Getopt::Long qw(:config bundling auto_help); +use Pod::Usage; +use strict; +use warnings; + use vars qw { $opt_f $opt_m $opt_M $opt_X $opt_p $opt_x $opt_h $opt_l $opt_L $opt_c - $opt_a $opt_t $opt_s $opt_i $sorting + $opt_a $opt_t $opt_s $opt_z $opt_inclang $opt_auto }; -sub usage { - die "hit-frequencies [-c rules dir] [-f] [-m RE] [-M RE] [-X RE] [-l LC] - [-s SC] [-a] [-p] [-x] [-i] [spam log] [ham log] +GetOptions("c|cffile=s@" => \$opt_c, + "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode) + "l|logfile=s" => \$opt_l, + "f|falses" => \$opt_f, + "a|all" => \$opt_a, + "p|percentages" => \$opt_p, + "x|extended" => \$opt_x, + "m|matchrule=s" => \$opt_m, #, + "t|tflags=s" => \$opt_t, + "M|matchlog=s" => \$opt_M, + "X|excludelog=s" => \$opt_X, + "L|language=s" => \$opt_L, + "include-language=s" => \$opt_inclang); - -c p use p as the rules directory - -f falses. count only false-negative or false-positive matches - -m RE print rules matching regular expression - -t RE print rules with tflags matching regular expression - -M RE only consider log entries matching regular expression - -X RE don't consider log entries matching regular expression - -l LC also print language specific rules for lang code LC (or 'all') - -L LC only print language specific rules for lang code LC (or 'all') - -a display all tests - -p percentages. implies -x - -x extended output, with S/O ratio and scores - -s SC which scoreset to use - -i use IG (information gain) for ranking - options -l and -L are mutually exclusive. +=head1 NAME - options -M and -X are *not* mutually exclusive. +hit-frequencies - Display statistics about tests hit by a mass-check run - if either the spam or and ham logs are unspecified, the defaults - are \"spam.log\" and \"ham.log\" in the cwd. +=head1 SYNOPSIS -"; -} +hit-frequencies [options] -usage() if($opt_h || ($opt_l && $opt_L)); + Options: + -c,--cffile=path Use path as the rules directory + -s,--scoreset=n Use scoreset n + -l,--logfile=file Read in file instead of masses.log + -f Count only false-positives/false-negatives + -a Report all tests (including subrules) + -p Report percentages instead of raw hits + -x "Extended" output, include RANK, S/O and SCORE + -m,--matchrule=re Print rules matching the regular expression + -t,--tflags=re Print only rules with tflags matching the regular expression + -M,--matchlog=re Consider only logs matching the regular expression + -X,--excludelog=re Exclude logs matching this regular expression + -L,--language=lc Only print language specific tests for specified lang code (try 'all') + --include-language=lc Also print language specific tests for specified lang code (try 'all') -if ($opt_p) { - $opt_x = 1; -} +=head1 DESCRIPTION -$opt_s = 0 if ( !defined $opt_s ); +B will read the mass-check log F or the +log given by the B<--logfile> option. The output will contain a +summary of the number of ham and spam messages and detailed statistics +for each rule. By default, B will try to guess the +proper values for B<--cffile> based on the header of the +masses.log. The output will include the following columns: -my $cffile = $opt_c || "$FindBin::Bin/../rules"; +=over 4 -my %freq_spam = (); -my %freq_ham = (); -my $num_spam = 0; -my $num_ham = 0; -my %ranking = (); -my $ok_lang = ''; +=item OVERALL -readscores($cffile); +Number of times (or percentage with B<-p>) the rule hit on +all messages (spam or ham). -$ok_lang = lc ($opt_l || $opt_L || ''); -if ($ok_lang eq 'all') { $ok_lang = '.'; } +=item SPAM -foreach my $key (keys %rules) { +Number of times (or percentage with B<-p>) the rule hit on +spam messages. - if ( ($opt_L && !$rules{$key}->{lang}) || - ($rules{$key}->{lang} && - (!$ok_lang || $rules{$key}->{lang} !~ /^$ok_lang/i) - ) ) { - delete $rules{$key} ; next; - } +=item HAM - $freq_spam{$key} = 0; - $freq_ham{$key} = 0; -} +Number of times (or percentage with B<-p>) the rule hit on +ham messages. -readlogs(); +=item S/O -my $hdr_all = $num_spam + $num_ham; -my $hdr_spam = $num_spam; -my $hdr_ham = $num_ham; +Shown only with B<-x> or B<-p>, this is the number of spam hits +divided by total number of hits (C refers to spam divided by +overall). -if ($opt_p) { - my $sorting = $opt_i ? "IG" : "RANK"; - if ($opt_f) { - printf "%7s %7s %7s %6s %6s %6s %s\n", - "OVERALL%", "FNEG%", "FPOS%", "S/O", $sorting, "SCORE", "NAME"; - } else { - printf "%7s %7s %7s %6s %6s %6s %s\n", - "OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME"; - } - printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n", - $hdr_all, $hdr_spam, $hdr_ham, - soratio ($num_spam,$num_ham), 0, 0; +=item RANK - $hdr_spam = ($num_spam / $hdr_all) * 100.0; - $hdr_ham = ($num_ham / $hdr_all) * 100.0; - $hdr_all = 100.0; # this is obvious - printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f (all messages as %%)\n", - $hdr_all, $hdr_spam, $hdr_ham, - soratio ($num_spam,$num_ham), 0, 0; +Shown only with B<-x> or B<-p>, this is a measure that attempts to +indicate how I or I a test is. The higher it is, the +better the test. -} elsif ($opt_x) { - printf "%7s %7s %7s %6s %6s %6s %s\n", - "OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME"; - printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n", - $hdr_all, $hdr_spam, $hdr_ham, - soratio ($num_spam,$num_ham), 0, 0; +=item SCORE -} else { - printf "%10s %10s %10s %s\n", - "OVERALL", "SPAM", "HAM", "NAME"; - printf "%10d %10d %10d (all messages)\n", - $hdr_all, $hdr_spam, $hdr_ham; -} +Shown only with B<-x> or B<-p>, this is the current score assigned to +the rule. -my %done = (); -my @tests = (); -my $rank_hi = 0; -my $rank_lo = 9999999; +=item NAME -# variables for wanted/unwanted RANK -my %wanted; -my %unwanted; -my %wranks; -my %uranks; +This is the rule's name. -foreach my $test (keys %freq_spam, keys %freq_ham) { - next unless (exists $rules{$test}); # only valid tests - next if (!$opt_a && $rules{$test}->{issubrule}); +=back - next if $done{$test}; $done{$test} = 1; - push (@tests, $test); +=head1 BUGS - my $isnice = 0; - if ($rules{$test}->{tflags} =~ /nice/) { $isnice = 1; } +Please report bugs to http://bugzilla.spamassassin.org/ - my $fs = $freq_spam{$test}; $fs ||= 0; - my $fn = $freq_ham{$test}; $fn ||= 0; - my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0; - my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0; +=head1 SEE ALSO - my $soratio = $soratio{$test} = soratio ($fsadj, $fnadj); +L, L, L - if ($isnice) { - $soratio = 1.0 - $soratio; - my $tmp = $fsadj; $fsadj = $fnadj; $fnadj = $tmp; - } +=cut - if ($opt_i) { - # come up with a ranking - my $rank; - - # New new system: from "Learning to Filter Unsolicited Commercial E-Mail", - # Ion Androutsopoulos et al: determine the information gain IG(X, C) of the - # Boolean attributes (ie. the rules). Measures "the average reduction in - # the entropy of C (classification) given the value of X (the rule)". Makes - # a good ranking measure with a proper statistical basis. ;) - # - # Still would like to get an entropy measure in, too. - # - # sum P(X = x ^ C = c) - # IG(X,C) = x in [0, 1] P(X = x ^ C = c) . log2( ------------------- ) - # c in [Ch, Cs] P(X = x) . P(C = c) - # - my $safe_nspam = $num_spam || 0.0000001; - my $safe_nham = $num_ham || 0.0000001; - - my $num_all = ($num_spam + $num_ham); - my $safe_all = $num_all || 0.0000001; - my $f_all = $fs+$fn; - - my $px0 = (($num_all - $f_all) / $safe_all); # P(X = 0) - my $px1 = ($f_all / $safe_all); # P(X = 1) - my $pccs = ($num_spam / $safe_all); # P(C = Cs) - my $pcch = ($num_ham / $safe_all); # P(C = Ch) - my $px1ccs = ($fs / $safe_nspam); # P(X = 1 ^ C = Cs) - my $px1cch = ($fn / $safe_nham); # P(X = 1 ^ C = Ch) - my $px0ccs = (($num_spam - $fs) / $safe_nspam); # P(X = 0 ^ C = Cs) - my $px0cch = (($num_ham - $fn) / $safe_nham); # P(X = 0 ^ C = Ch) - my $safe_px0_dot_pccs = ($px0 * $pccs) || 0.00000001; - my $safe_px0_dot_pcch = ($px0 * $pcch) || 0.00000001; - my $safe_px1_dot_pccs = ($px1 * $pccs) || 0.00000001; - my $safe_px1_dot_pcch = ($px1 * $pcch) || 0.00000001; - - sub log2 { return log($_[0]) / 0.693147180559945; } # log(2) = 0.6931... - - my $safe_px0ccs = ($px0ccs || 0.0000001); - my $safe_px0cch = ($px0cch || 0.0000001); - my $safe_px1ccs = ($px1ccs || 0.0000001); - my $safe_px1cch = ($px1cch || 0.0000001); - $rank = ( $px0ccs * log2($safe_px0ccs / $safe_px0_dot_pccs) ) + - ( $px0cch * log2($safe_px0cch / $safe_px0_dot_pcch) ) + - ( $px1ccs * log2($safe_px1ccs / $safe_px1_dot_pccs) ) + - ( $px1cch * log2($safe_px1cch / $safe_px1_dot_pcch) ); - - $ranking{$test} = $rank; - $rank_hi = $rank if ($rank > $rank_hi); - $rank_lo = $rank if ($rank < $rank_lo); - } - else { - # basic wanted/unwanted ranking - $wanted{$test} = $isnice ? $fn : $fs; - $unwanted{$test} = $isnice ? $fs : $fn; - # count number of ranks of each type - $wranks{$wanted{$test}} = 1; - $uranks{$unwanted{$test}} = 1; - } +if ($opt_L && $opt_inclang) { + pod2usage("-L/--language and --include-language are mutually exclusive"); } -# finish basic wanted/unwanted ranking -if (! $opt_i) { - my @wanted = sort { $wanted{$a} <=> $wanted{$b} } keys %wanted; - my @unwanted = sort { $unwanted{$b} <=> $unwanted{$a} } keys %unwanted; - - # first half of ranking is the wanted rank - my $position = 0; - my $last = undef; - for my $test (@wanted) { - $position++ if defined $last && $last != $wanted{$test}; - $ranking{$test} += $position; - $last = $wanted{$test} - } - - # second half of ranking is the unwanted rank - my $normalize = (scalar keys %wranks) / (scalar keys %uranks); - $position = 0; - $last = undef; - for my $test (@unwanted) { - $position++ if defined $last && $last != $unwanted{$test}; - $ranking{$test} += ($position * $normalize); - $last = $unwanted{$test}; - $rank_hi = $ranking{$test} if ($ranking{$test} > $rank_hi); - $rank_lo = $ranking{$test} if ($ranking{$test} < $rank_lo); - } +if ($opt_p) { + $opt_x = 1; } -{ - # now normalise the rankings to [0, 1] - $rank_hi -= $rank_lo; - foreach $test (@tests) { - $ranking{$test} = $rank_hi == 0 ? 0.001 : ($ranking{$test} - $rank_lo) / ($rank_hi); - } -} +$opt_s = 0 if ( !defined $opt_s ); -foreach $test (sort { $ranking{$b} <=> $ranking{$a} } @tests) { - next unless (exists $rules{$test}); # only valid tests - next if (!$opt_a && $rules{$test}->{issubrule}); +my $ok_lang = lc ( $opt_inclang || $opt_L || ''); +$ok_lang = '.' if ($ok_lang eq 'all'); - my $fs = $freq_spam{$test}; $fs ||= 0; - my $fn = $freq_ham{$test}; $fn ||= 0; - my $fa = $fs+$fn; +my $greprules = sub { # To determine whether rule should be read + my ($name, $rule) = @_; - next if ($opt_m && $test !~ m/$opt_m/); # match certain tests - next if ($opt_t && $rules{$test}->{tflags} !~ /$opt_t/); # match tflags + return 0 if ($opt_m && $name !~ /$opt_m/); # name doesn't match -m + # expression + return 0 if ($opt_t && $rule->{tflags} !~ /$opt_t/); # tflags don't + # match -t + # expression + return 0 if (($opt_L && !$rule->{lang}) || + ($rule->{lang} && + (!$ok_lang || $rule->{lang} !~ /^$ok_lang/i))); # Wrong language + return 0 if ($rule->{issubrule} && !$opt_a); + if (!$opt_a && !$opt_t) { - next if ($rules{$test}->{tflags} =~ /net/ && ($opt_s % 2 == 0)); # not net tests - next if ($rules{$test}->{tflags} =~ /userconf/); # or userconf + return 0 if ($rule->{tflags} =~ /net/ && ($opt_s % 2 == 0)); + return 0 if ($rule->{tflags} =~ /userconf/); # or userconf } + return 1; - # adjust based on corpora sizes (and cvt to % while we're at it) - my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0; - my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0; +}; - if ($opt_f && $fsadj == 0 && $fnadj == 0) { next; } - if ($opt_p) { - $fa = ($fa / ($num_spam + $num_ham)) * 100.0; - $fs = $fsadj; - $fn = $fnadj; - } +my $logfile = $opt_l || "masses.log"; - my $soratio = $soratio{$test}; - if (!defined $soratio) { - $soratio{$test} = soratio ($fsadj, $fnadj); - } +if (!$opt_c || !scalar(@$opt_c)) { + # Try to read this in from the log, if possible + open IN, $logfile or die "Can't open $logfile: $!"; + my $files = 0; # are we in the files section? + while() { + if (!$files) { + if (/^\# SVN revision:/) { + $opt_c = [ "$FindBin::Bin/../rules" ]; + last; + } elsif (/^\# Using configuration:$/) { + $files = 1; + } + } elsif (/^\#\s+(.*)\s*$/) { + push (@$opt_c, $1); + } else { + # All done! + last; + } + } - if ($opt_p) { - printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f %s\n", - $fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}, $test; + if (!defined $opt_c) { + $opt_c = [ "$FindBin::Bin/../rules" ]; + } - } elsif ($opt_x) { - printf "%7d %7d %7d %7.3f %6.2f %6.2f %s\n", - $fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}, $test; - - } else { - printf "%10d %10d %10d %s\n", $fa, $fs, $fn, $test; - } + foreach my $file (@$opt_c) { + die "Can't read $file" unless -r $file; + } } -exit; + +my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c, + scoreset => $opt_s, + falsesonly => $opt_f, + greprules => $greprules, + logfile => $logfile, + nologs => 1}); +$masses->readrules(); +$masses->readlogs(); +$masses->do_statistics(); +$masses->do_rank(); +my $rules = $masses->get_rules_hash(); +my $num_ham = $masses->get_num_ham(); +my $num_spam = $masses->get_num_spam(); +my $num_all = $num_ham + $num_spam; -sub readlogs { - my $spam = $ARGV[0] || "spam.log"; - my $ham = $ARGV[1] || (-f "good.log" ? "good.log" : "ham.log"); +if ($num_ham + $num_spam <= 0) { + die "Can't run hit-frequencies on 0 messages."; +} - foreach my $file ($spam, $ham) { - open (IN, "<$file") || die "Could not open file '$file': $!"; +## Write header - my $isspam = 0; ($file eq $spam) and $isspam = 1; +if ($opt_p) { - while () { - next if (/^#/); - next unless (!$opt_M || /$opt_M/o); - next if ($opt_X && /$opt_X/o); + if ($opt_f) { + printf "%7s %7s %7s %6s %6s %6s %s\n", + "OVERALL%", "FNEG%", "FPOS%", "S/O", "RANK", "SCORE", "NAME"; + } else { + printf "%7s %7s %7s %6s %6s %6s %s\n", + "OVERALL%", "SPAM%", "HAM%", "S/O", "RANK", "SCORE", "NAME"; + } - /^(.)\s+(-?\d+)\s+(\S+)\s*(\S*)/ or next; - my $caught = ($1 eq 'Y'); - my $hits = $2; - $_ = $4; s/,,+/,/g; + printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n", + $num_all, $num_spam, $num_ham, + $num_spam / $num_all, 0, 0; - if ($isspam) { - if ($opt_f) { - if (!$caught) { $num_spam++; } - } else { - $num_spam++; - } - } else { - if ($opt_f) { - if ($caught) { $num_ham++; } - } else { - $num_ham++; - } - } + printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f (all messages as %%)\n", + 100.0, $num_spam / $num_all * 100.0, $num_ham / $num_all * 100.0, + $num_spam / $num_all, 0, 0; - my @tests = split (/,/, $_); - foreach my $t (@tests) { - next if ($t eq ''); - if ($isspam) { - if ($opt_f) { - if (!$caught) { $freq_spam{$t}++; } - } else { - $freq_spam{$t}++; - } - } else { - if ($opt_f) { - if ($caught) { $freq_ham{$t}++; } - } else { - $freq_ham{$t}++; - } - } - } - } - close IN; - } -} +} elsif ($opt_x) { + printf "%7s %7s %7s %6s %6s %6s %s\n", + "OVERALL", "SPAM", "HAM", "S/O", "RANK", "SCORE", "NAME"; + printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n", + $num_all, $num_spam, $num_ham, + $num_spam / $num_all, 0, 0; - -sub readscores { - my($cffile) = @_; - system ("$FindBin::Bin/parse-rules-for-masses -d \"$cffile\" -s $opt_s") and die; - require "./tmp/rules.pl"; +} else { + printf "%10s %10s %10s %s\n", + "OVERALL", "SPAM", "HAM", "NAME"; + printf "%10d %10d %10d (all messages)\n", + $num_all, $num_spam, $num_ham; } -sub soratio { - my ($s, $n) = @_; +foreach my $test (sort { $rules->{$b}->{rank} <=> $rules->{$a}->{rank} } keys %{$rules}) { - $s ||= 0; - $n ||= 0; - - if ($s + $n > 0) { - return $s / ($s + $n); + if ($opt_p) { + printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f %s\n", + ($rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham}) / $num_all * 100.0, + $rules->{$test}->{spam_percent}, $rules->{$test}->{ham_percent}, + $rules->{$test}->{soratio}, $rules->{$test}->{rank}, $rules->{$test}->{score}, $test; + } elsif ($opt_x) { + printf "%7d %7d %7d %7.3f %6.2f %6.2f %s\n", + $rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham}, + $rules->{$test}->{freq_spam}, $rules->{$test}->{freq_ham}, + $rules->{$test}->{soratio}, $rules->{$test}->{rank}, $rules->{$test}->{score}, $test; } else { - return 0.5; # no results -> not effective + printf "%10d %10d %10d %s\n", + $rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham}, + $rules->{$test}->{freq_spam}, $rules->{$test}->{freq_ham}, $test; } } -sub tcr { - my ($nspam, $nlegit, $nspamspam, $nlegitspam) = @_; - my $nspamlegit = $nspam - $nspamspam; - my $nlegitlegit = $nlegit - $nlegitspam; - - my $lambda = 99; - - my $werr = ($lambda * $nlegitspam + $nspamlegit) - / ($lambda * $nlegit + $nspam); - - my $werr_base = $nspam - / ($lambda * $nlegit + $nspam); - - $werr ||= 0.000001; # avoid / by 0 - my $tcr = $werr_base / $werr; - return $tcr; -} Index: masses/perceptron.pod =================================================================== --- masses/perceptron.pod (revision 0) +++ masses/perceptron.pod (revision 0) @@ -0,0 +1,30 @@ +=head1 NAME + +perceptron - Generate scores for SpamAssassin using the "Stochastic +Gradient Method" + +=head1 SYNOPSIS + +perceptron [options] + + Options: + -p ham_preference Modifies tendency to prefer false negatives over + false positives (default 2.0) (higher = less fp) + -e num_epochs Set number of passes to make (default 15) + -l learning_rate Modifies learning rate (default 2.0) + -w weight_decay Scores multiplied by this value after each pass + to prevent scores from getting too high + (default off (1.0)) + +=head1 DESCRIPTION + +This algorithm is used to optimize SpamAssassin scores, based on the +input given by B. At the time of writing, the output of +logs-to-c needs to be compiled into the source before perceptron can +be used, but this will be fixed soon, I hope. + +=head1 SEE ALSO + +L + +=cut Index: masses/rewrite-cf-with-new-scores =================================================================== --- masses/rewrite-cf-with-new-scores (revision 20231) +++ masses/rewrite-cf-with-new-scores (working copy) @@ -16,32 +16,123 @@ # limitations under the License. # +=head1 NAME + +rewrite-cf-with-new-scores - Rewrite SpamAssassin scores file with new +scores. + +=head1 SYNOPSIS + +rewrite-cf-with-new-scores [options] + + Options + --old-scores=file Read file containing the old SpamAssassin scores + --new-scores=file Read file containing the new SpamAssassin scores + -s,--scoreset n Rewrite scoreset n + --output=file Output rewritten score file to file + -c,--cffile=path Use path as the rules directory + -l,--logfile=file Use file instead of masses.log (for guessing -c) + + Note: these options can be shortened (i.e. --old, --new, --out) as + long as they are unambiguous. + +=head1 DESCRIPTION + +B is a tool to update the sitewide scores +file with the newly generated scores. Since SpamAssassin has four +different scoresets, which each need to be generated separately, this +tool is used to only change the correct scoreset. + +By default, the old scores are read from 50_scores.cf in the rules +directory and the new ones from ./perceptron.scores. The output will +be ./50_scores.cf by default. + +The rules directory needs to be used to make sure scores are given for +the right tests. Rules not found in the rules directory will not be +given scores in the output. + +=head1 BUGS + +Please report bugs to http://bugzilla.spamassassin.org/ + +=head1 SEE ALSO + +L, L, L + +=cut + +use FindBin; +use lib "$FindBin::Bin/../lib"; +use Getopt::Long qw(:config bundling auto_help); +use Mail::SpamAssassin::Masses; +use Pod::Usage; +use strict; +use warnings; + +use vars qw($opt_old $opt_new $opt_scoreset $opt_out $opt_c $opt_l); + +GetOptions("old-scores=s" => \$opt_old, + "new-scores=s" => \$opt_new, + "s|scoreset=i" => \$opt_scoreset, + "output=s" => \$opt_out, + "c|cffile=s@" => \$opt_c, + "l|logfile=s" => \$opt_l); + +$opt_l ||= "masses.log"; +$opt_scoreset = 0 unless defined $opt_scoreset; + my $NUM_SCORESETS = 4; -my ($scoreset,$oldscores,$newscores) = @ARGV; +if (!$opt_c || !scalar(@$opt_c)) { + # Try to read this in from the log, if possible + open IN, $opt_l or die "Can't open $opt_l: $!"; + my $files = 0; # are we in the files section? + while() { + if (!$files) { + if (/^\# SVN revision:/) { + $opt_c = [ "$FindBin::Bin/../rules" ]; + last; + } elsif (/^\# Using configuration:$/) { + $files = 1; + } + } elsif (/^\#\s+(.*)\s*$/) { + push (@$opt_c, $1); + } else { + # All done! + last; + } + } -$scoreset = int($scoreset) if defined $scoreset; -if (!defined $newscores || $scoreset < 0 || $scoreset >= $NUM_SCORESETS ) { - die "usage: rewrite-cf-with-new-scores scoreset oldscores.cf newscores.cf\n"; + if (!defined $opt_c) { + $opt_c = [ "$FindBin::Bin/../rules" ]; + } + + foreach my $file (@$opt_c) { + die "Can't read $file" unless -r $file; + } } -system ("./parse-rules-for-masses -s $scoreset") and die; -if (-e "tmp/rules.pl") { - # Note, the spaces need to stay in front of the require to work around a RPM 4.1 problem - require "./tmp/rules.pl"; +if (!$opt_old) { + $opt_old = $$opt_c[0] . "/50_scores.cf"; } -else { - die "parse-rules-for-masses had no error but no tmp/rules.pl!?!"; -} +$opt_new ||= "50_scores.cf"; + +my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c, + scoreset => $opt_scoreset}); + +$masses->readrules(); +my $rules = $masses->get_rules_hash(); + # now read the generated scores my @gascoreorder = (); +my %oldscores = (); my %gascorelines = (); -open (STDIN, "<$newscores") or die "cannot open $newscores"; +open (STDIN, "<$opt_new") or die "cannot open $opt_new"; while () { /^score\s+(\S+)\s+(-?\d+(?:\.\d+)?)/ or next; my $name = $1; my $score = $2; - next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0); + next unless (exists ($rules->{$name}) && !$rules->{$name}->{issubrule}); next if ($name =~ /^__/); next if ($name eq '(null)'); # er, oops ;) @@ -49,7 +140,7 @@ push (@gascoreorder, $name); } -open (IN, "<$oldscores") or die "cannot open $oldscores"; +open (IN, "<$opt_old") or die "cannot open $opt_old"; my $out = ''; my $pre = ''; @@ -58,7 +149,7 @@ while () { if (/^\s*score\s+(\S+)\s/) { delete $gascorelines{$1}; - next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0); + next unless (exists ($rules->{$1}) && $rules->{$1}->{issubrule} == 0); } $pre .= $_; /^# Start of generated scores/ and last; @@ -82,10 +173,10 @@ if (/^\s*score\s+\S+/) { my($score,$name,@scores) = split; - next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0); + next unless (exists ($rules->{$name}) && !$rules->{$name}->{issubrule}); if (defined $gascorelines{$name}) { # Set appropriate scoreset value - $scores[$scoreset] = $gascorelines{$name}; + $scores[$opt_scoreset] = $gascorelines{$name}; # Create new score line $_ = join(" ","score",$name,generate_scores(@scores))."\n"; @@ -96,8 +187,10 @@ } close IN; +open OUT, ">$opt_out" or die "Can't open $opt_out: $!"; + # and output the lot -print $pre, "\n"; +print OUT $pre, "\n"; foreach my $name (@gascoreorder) { $_ = $gascorelines{$name}; next unless (defined ($_)); @@ -107,12 +200,12 @@ @scores = @{$oldscores{$name}} if ( exists $oldscores{$name} ); # Set appropriate scoreset value - $scores[$scoreset] = $_; + $scores[$opt_scoreset] = $_; # Create new score line - print join(" ","score",$name,generate_scores(@scores)),"\n"; + print OUT join(" ","score",$name,generate_scores(@scores)),"\n"; } -print "\n", $out, "\n"; +print OUT "\n", $out, "\n"; sub generate_scores { my (@scores) = @_; Index: masses/mboxget =================================================================== --- masses/mboxget (revision 20231) +++ masses/mboxget (working copy) @@ -1,45 +0,0 @@ -#!/usr/bin/perl -w - -# mboxget - get a message from a mailbox -# -# usage: mboxget [mass-check-mbox-id ...] -# -# <@LICENSE> -# Copyright 2004 Apache Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -use strict; - -my $prog = $0; -$prog =~ s@.*/@@; - -foreach my $where (@ARGV) { - my ($file, $offset) = ($where =~ m/(.*?)(?:\.(\d+))?$/); - open(INPUT, $file) || die("$prog: open $file failed: $!\n"); - if ($offset) { - seek(INPUT, $offset, 0) || die("$prog: seek $offset failed: $!\n"); - } - my $past = 0; - while () { - if ($past) { - last if substr($_,0,5) eq "From "; - } - else { - $past = 1; - } - print $_; - } - close INPUT; -} Index: masses/rule-qa/corpus-nightly =================================================================== --- masses/rule-qa/corpus-nightly (revision 20231) +++ masses/rule-qa/corpus-nightly (working copy) @@ -81,14 +81,13 @@ date > test.end # results name -mv spam.log spam-$net$username.log -mv ham.log ham-$net$username.log +mv masses.log masses-$net$username.log # rsync set +e retry=0 while true; do - if rsync -CPcvuzb --timeout=120 spam-$net$username.log ham-$net$username.log $username@rsync.spamassassin.org::corpus/; then + if rsync -CPcvuzb --timeout=120 masses-$net$username.log $username@rsync.spamassassin.org::corpus/; then break; fi if [ $retry -eq 120 ]; then @@ -99,3 +98,4 @@ sleep 30 done set -e + Index: masses/rule-qa/corpus-hourly =================================================================== --- masses/rule-qa/corpus-hourly (revision 20231) +++ masses/rule-qa/corpus-hourly (working copy) @@ -92,7 +92,7 @@ @files = sort readdir(CORPUS); closedir(CORPUS); - @files = grep { /^(?:spam|ham)-(?:net-)?\w+\.log$/ && -f "$opt{corpus}/$_" && -M _ < 10 } @files; + @files = grep { /^masses-(?:net-)?\w+\.log$/ && -f "$opt{corpus}/$_" && -M _ < 10 } @files; @files = grep { my $time = 0; my $tag = 0; @@ -109,6 +109,7 @@ } $time; } @files; + } sub rename { @@ -158,131 +159,96 @@ next if ($class eq "NET" && $age !~ /^(?:new|all|age|7day)$/); - my @ham = grep { /^ham/ } @files; - my @spam = grep { /^spam/ } @files; + print STDERR "logs: " . join(' ', @files) . "\n"; - print STDERR "ham: " . join(' ', @ham) . "\n"; - print STDERR "spam: " . join(' ', @spam) . "\n"; - chdir $opt{corpus}; # net vs. local if ($class eq "NET") { - @ham = grep { /-net-/ } @ham; - @spam = grep { /-net-/ } @spam; - print STDERR "ham: " . join(' ', @ham) . "\n"; - print STDERR "spam: " . join(' ', @spam) . "\n"; + @files = grep { /-net-/ } @files; + print STDERR "logs: " . join(' ', @files) . "\n"; } else { # if both net and local exist, use newer - my %spam; - my %ham; - for my $file (@spam) { - $spam{$1}++ if ($file =~ m/-(\w+)\.log$/); + for my $file (@files) { + $logs{$1}++ if ($file =~ m/-(\w+)\.log$/); } - for my $file (@ham) { - $ham{$1}++ if ($file =~ m/-(\w+)\.log$/); - } - while (my ($user, $count) = each %ham) { + while (my ($user, $count) = each %logs) { if ($count > 1) { - my $nightly = "ham-$user.log"; - my $weekly = "ham-net-$user.log"; + my $nightly = "masses-$user.log"; + my $weekly = "masses-net-$user.log"; if ($revision{$nightly} >= $revision{$weekly}) { - @ham = grep { $_ ne $weekly } @ham; + @files = grep { $_ ne $weekly } @files; } else { - @ham = grep { $_ ne $nightly } @ham; + @files = grep { $_ ne $nightly } @files; } } } - while (my ($user, $count) = each %spam) { - if ($count > 1) { - my $nightly = "spam-$user.log"; - my $weekly = "spam-net-$user.log"; - if ($revision{$nightly} >= $revision{$weekly}) { - @spam = grep { $_ ne $weekly } @spam; - } - else { - @spam = grep { $_ ne $nightly } @spam; - } - } - } - print STDERR "ham: " . join(' ', @ham) . "\n"; - print STDERR "spam: " . join(' ', @spam) . "\n"; + print STDERR "logs: " . join(' ', @files) . "\n"; } # age if ($class eq "NET" && $age ne "7day") { - @ham = grep { -M "$_" < 10 } @ham; - @spam = grep { -M "$_" < 10 } @spam; + @files = grep { -M "$_" < 10 } @files; # find most recent CVS revision my $wanted = 0.0; for (@spam, @ham) { $wanted = $revision{$_} if ($revision{$_} > $wanted); } - @spam = grep { $revision{$_} eq $wanted } @spam; - @ham = grep { $revision{$_} eq $wanted } @ham; - print STDERR "ham: " . join(' ', @ham) . "\n"; - print STDERR "spam: " . join(' ', @spam) . "\n"; + @files = grep { $revision{$_} eq $wanted } @files; + + print STDERR "logs: " . join(' ', @files) . "\n"; } elsif ($age =~ /^(?:new|all|age)$/) { - @ham = grep { -M "$_" < -M $opt{tagtime} } @ham; - @spam = grep { -M "$_" < -M $opt{tagtime} } @spam; - @ham = grep { $revision{$_} eq $revision } @ham; - @spam = grep { $revision{$_} eq $revision } @spam; - print STDERR "ham: " . join(' ', @ham) . "\n"; - print STDERR "spam: " . join(' ', @spam) . "\n"; + @files = grep { -M "$_" < -M $opt{tagtime} } @files; + + @files = grep { $revision{$_} eq $revision } @files; + + print STDERR "logs: " . join(' ', @files) . "\n"; } elsif ($age =~ /(\d+)day/) { my $mtime = $1; - @ham = grep { -M "$_" < $mtime } @ham; - @spam = grep { -M "$_" < $mtime } @spam; - print STDERR "ham: " . join(' ', @ham) . "\n"; - print STDERR "spam: " . join(' ', @spam) . "\n"; + @files = grep { -M "$_" < $mtime } @files; + + print STDERR "logs: " . join(' ', @files) . "\n"; } open(OUT, "> $opt{html}/$class.$age"); - print OUT "# ham results used: " . join(" ", @ham) . "\n"; - print OUT "# spam results used: " . join(" ", @spam) . "\n"; - for (@ham) { + print OUT "# results used: " . join(" ", @files) . "\n"; + + for (@files) { print OUT "# $_=$revision{$_}\n" if $revision{$_} ne $revision; } - for (@spam) { - print OUT "# $_=$revision{$_}\n" if $revision{$_} ne $revision; - } my $flags = ""; $flags = "-t net -s 1" if $class eq "NET"; $flags = "-M HTML_MESSAGE" if $class eq "HTML"; if ($age eq "all") { - my %spam; - my %ham; + my %logs; my @output; - for my $file (@spam) { - $spam{$1} = $file if ($file =~ m/-(\w+)\.log$/); + for my $file (@files) { + $logs{$1} = $file if ($file =~ m/-(\w+)\.log$/); } - for my $file (@ham) { - $ham{$1} = $file if ($file =~ m/-(\w+)\.log$/); - } - unlink "$opt{tmp}/ham.log.$$"; - unlink "$opt{tmp}/spam.log.$$"; - next unless (scalar keys %spam && scalar keys %ham); - for my $user (sort keys %spam) { - next unless defined $ham{$user}; + + unlink "$opt{tmp}/masses.log.$$"; + + next unless (scalar keys %logs); + for my $user (sort keys %logs) { + chdir "$opt{tree}/masses"; - system("cat $opt{corpus}/$ham{$user} >> $opt{tmp}/ham.log.$$"); - system("cat $opt{corpus}/$spam{$user} >> $opt{tmp}/spam.log.$$"); - open(IN, "./hit-frequencies -xpa $flags $opt{corpus}/$spam{$user} $opt{corpus}/$ham{$user} |"); + system("cat $opt{corpus}/$logs{$user} >> $opt{tmp}/masses.log.$$"); + open(IN, "./hit-frequencies -xpa $flags -l $opt{corpus}/$logs{$user} |"); while() { chomp; push @output, "$_:$user\n"; } close(IN); } - open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |"); + open(IN, "./hit-frequencies -xpa $flags -l $opt{tmp}/masses.log.$$ |"); while() { push @output, $_; } @@ -298,21 +264,20 @@ my ($after, $before) = split(/-/, $which); # get and filter logs chdir $opt{corpus}; - for my $type (("ham", "spam")) { - open(TMP, "> $opt{tmp}/$type.log.$$"); - my @array = ($type eq "ham") ? @ham : @spam; - for my $file (@array) { - open(IN, $file); - while () { - print TMP $_ if time_filter($after, $before); - } - close(IN); - } - close (TMP); + + open(TMP, "> $opt{tmp}/masses.log.$$"); + for my $file (@files) { + open(IN, $file); + while () { + print TMP $_ if time_filter($after, $before); + } + close(IN); } + close (TMP); + # print out by age chdir "$opt{tree}/masses"; - open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |"); + open(IN, "./hit-frequencies -xpa $flags -l $opt{tmp}/masses.log.$$ |"); while() { chomp; push @output, "$_:$which\n"; @@ -323,13 +288,11 @@ print OUT $_; } } - elsif (@ham && @spam) { + elsif (@files) { # get logs - system("cat " . join(" ", @ham) . " > $opt{tmp}/ham.log.$$"); - system("cat " . join(" ", @spam) . " > $opt{tmp}/spam.log.$$"); - + system("cat " . join(" ", @files) . " > $opt{tmp}/masses.log.$$"); chdir "$opt{tree}/masses"; - open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |"); + open(IN, "./hit-frequencies -xpa $flags -l $opt{tmp}/masses.log.$$ |"); while() { print(OUT); } Index: masses/README.user =================================================================== --- masses/README.user (revision 0) +++ masses/README.user (revision 0) @@ -0,0 +1,375 @@ + +HOW TO GENERATE YOUR OWN SCORES FOR SPAMASSASSN +----------------------------------------------- + +Duncan Findlay + + + +1. Introduction + +One of the reasons SpamAssassin is so accurate is that it's scores are +carefully optimized based on collections (aka. corpus, plural: +corpora) of mail from volunteers all across the world. Each volunteer +uses a script ("mass-check") to run SpamAssassin over each piece of +mail in their corpus. They then submit the results to a central server +where the SpamAssassin development team runs the scoring mechanism to +generate optimal scores. + +SpamAssassin uses four different scoresets depending on the options +used. These are almost always referred to by number, as shown below: + +Scoreset + 0 - Network tests disabled, Bayes disabled + 1 - Network tests enabled, Bayes disabled + 2 - Network tests disabled, Bayes enabled + 3 - Network tests enabled, Bayes enabled + +Things are further complicated by the fact that when Bayes is enabled, +it automatically learns using the equivalent scoreset with Bayes +disabled. As a result, optimal scores for scoresets 2 and 3 can only +be generated after scoresets 0 and 1. Set 0 logs can be generated from +set 1 logs, but sets 2 and 3 need to be done separately. + +As a result, volunteers who take part in our rescoring survey need to +run 3 mass-checks, each of which can take many hours. Since the +generation of scores is such a labourious process, the SpamAssassin +developers only perform this once per release. + +Luckily, the previous score optimizer, a Genetic Algorithm, which took +almost 24 hours to optimize scores for one scoreset has been replaced +with the Perceptron (thanks to Henry Stern) which uses a "Stochastic +Gradient Descent" method. Don't worry if you don't understand what +this means, I certainly don't. The Perceptron takes less than 15 +seconds to generate scores of roughly equal quality as the GA. + + +2. Compiling a Corpus + +The first step to generating your own scores it to start collecing +mail, both ham (non-spam) and spam. These should be representative of +all the mail you receive, but you should filter out spam related +lists, like spamassassin-users to avoid skewing results. It is +essential that these corpora be very well classified. It will greatly +reduce the effectiveness of your scores if spam mails get misfiled +into your ham folder and vice versa. + +Also, it is important to note that SpamAssassin is not designed to be +a virus filter, so it's best if you filter out viurses from your ham +and spam folders too. + +Furthermore, since spam and ham characteristics change over time, it's +best to leave out mail over 6 months. This is especially important for +network tests, since these are designed to stop current spam, and are +not historical records. + +I'm not entirely sure how big corpora should be. The bigger, the +better. If your corpus is too small, it may not be sufficiently +representative of all the mail you receive, and accuracy will +suffer. My corpus of mail for the last 6 months is over 55000 messages +(35000 spam, 20000 ham). + + +3. Mass-check + +Now that you've assembled your corpora, you need to use mass-check to +test each message with SpamAssassin. This script is surprisingly fast, +as it accesses the internal perl libraries of SpamAssassin, without +the need to load a new perl process each time (as you would if you +piped each message through spamassassin). Doing a scoreset 2 run (no +network, bayes enabled) I get roughly 10,000 messages an hour on an +unloaded Pentium 4, 2.80Ghz computer with 512 MB RAM. + +By default, if you are not running out of an unpacked source tree, +mass-check will read rules from the usual locations. As a result, you +should make sure ~/.spamassassin/user_prefs contains no rules, unless +you are planning on using your generated scores for only yourself, not +sitewide. + +The first step is to define the locations of all of the messages in +your corpora (these are known as "targets"). I find it's easiest to +put this in a separate file with line of the following format: + +class:format:location + +Class is either "spam" or "ham", format is "mbox", "file", "dir" or +"mbx" and location is the path to the mailbox. mass-check supports +using * as a wildcard, so the following target is permitted: + +spam:mbox:/home/duncf/Maildir/Old/spam/* + +Once you have placed all the "targets" necessary for your corpora, run +mass-check with the following command. + +mass-check -f file + +If you doing a mass-check run for scoreset 1 or 3 (i.e. network tests +enabled) you will also need to add the --net option, and you will want +to add -j8 (or some other number) to indicate how many messages to +test in parallel. This is useful since a lot of time would otherwise +be spent waiting for network queries to return. + +mass-check will generate a log file in the current directory entitled +masses.log. This is the log file that will enable us to optimize +scores. + +For the impatient: if you're one of those people who want to know +exactly how far mass-check has gotten through your mail, use the +--showdots option. + + +4. Checking the quality of your corpora (a.k.a. Pulling Weeds) + +In order to ensure that your corpora don't contain misfiled mails, it +is good to double check the highest scoring hams and lowest scoring +spams. + +First check ham mail: + +grep "^h" masses.log | sort -rn -k2,2 | head -20 + +If you want to read the corresponding messages try piping to +extract-message-from-mbox -m (see the extract-message-from-mbox +section for more detail). + +Do the same with spam mail: + +grep "^s" masses.log | sort -n -k2,2 | head -20 + + +5. extract-message-from-mbox + +extract-message-from-mbox takes a mbox filename and a byte offset and +outputs the corresponding mail message. With the -m option, mass-check +output (i.e. lines from masses.log) is read from the standard +input. Without, arguments are expected to be in the form +. (i.e. /path/to/mbox.12345) + +The -h option can also be used to only show message headers. + +As shown above, it is quite useful to pipe portions of masses.log to +extract-message-from-mbox. + + +6. hit-frequencies + +hit-frequencies doesn't really help you advance toward your goal of +optimizing scores, but it is very useful in evaluating locally created +rules. Run it, look at it's output; you'll find it intersting (and if +not, feel free to skip to the next section). + +hit-frequencies -x -p -s + +hit-frequencies (and many other scripts) are set to automatically +guess where to find your configuration files based on +masses.log. Unfortunately, it isn't perfect (actually it's a rather +crude hack, but that's irrelevant). You may have to check masses.log +to figure out where it's searching and/or add --cffile options (you +can specify multiple paths using multiple --cffile options). + +hit-frequencies -x -p generates the following output: + +OVERALL% SPAM% HAM% S/O RANK SCORE NAME + 64008 40932 23076 0.639 0.00 0.00 (all messages) +100.000 63.9483 36.0517 0.639 0.00 0.00 (all messages as %) + 10.382 16.2342 0.0000 1.000 1.00 3.10 FORGED_MUA_OUTLOOK + 8.266 12.9263 0.0000 1.000 0.99 1.00 FORGED_OUTLOOK_TAGS + 6.484 10.1388 0.0000 1.000 0.98 4.50 DRUGS_ERECTILE_OBFU +[...] + +The first two rows show the size of the corpora and their ham/spam +break down. The following lines list each rule found and give various +statistics about it based on your masses.log. + +OVERALL% represents the percentage of total messages (spam and ham) +that the rule hits, SPAM% and HAM% show the percentages on each +corpus. S/O is the SPAM% divided by the OVERALL%. Generally good +(non-nice) rules have S/O's over 0.95, while nice (negative-scoring) +rules generally have S/O's less than 0.5. RANK is a human readable +indicator of how good a rule is. The higher the better, always. RANK +is designed to be a rough indicator of the score the perceptron is +likely to give it. SCORE is simply the current score. (This is simply +listed for convenience, not calculated in any way.) + +If you do any rule development locally, you will find this is a great +tool. If you come up with some great rules (that we haven't already +thought of), please send us a patch at +http://bugzilla.spamassassin.org/. + + +7. lint-rules-from-freqs + +This script is designed to read in your masses.log and the +SpamAssassin configuration files in order to find both bad syntax and +bad rules that hit few messages or (with -f) have too many false +positives/negatives, etc. + +lint-rules-from-freqs -f -s + +As with hit-frequencies, it tries to be smart with choosing the right +--cffile options. + +This script is roughly the equivalent of running a spamassassin --lint +and running a hit-frequencies to determine which tests have bad S/O +ratios. + + +8. logs-to-c + +logs-to-c is the program that converts a mass-check log into code that +can be easily used by the perceptron. Currently, it is necessary to +use the output of logs-to-c to even compile perceptron, but that +should hopefully change in the near future. + +The files logs-to-c create need to be in the tmp/ sub-directory of the +directory where perceptron.c is. + +logs-to-c -o tmp/ -s + +These files contain information about each rule such as whether or not +the perceptron is permitted to change the rule's score, the range +within which the perceptron can adjust it, whether or not a rule is +nice, etc. In addition, these files contain information about each +mail hit and which tests were hit. The files generated by logs-to-c +are not really easy to read, so don't try; use hit-frequencies +instead. + + +9. perceptron + +perceptron is the brains behind the whole process. (And we must of +course thank the brain behind perceptron, Henry Stern, for his +contribution.) + +While the perceptron takes options for things such as "ham +preference", "number of epochs", "learning rate" and "weight decay", +it's probably best to trust the defaults; unless of course you want to +try to find the optimum parameters (and post them to +http://bugzilla.spamassassin.org/ with your evidence). + +The perceptron is incredibly quick. So start it, wait 15 seconds and +voila, your optimized scores are ready. The output is in +perceptron.scores. + +Unfortunately, it needs to be built from source every time you want to +use it with a different masses.log or set of rules. In the directory +containing perceptron.c, try: + +make perceptron +./perceptron + +If you don't have the Makefile, try +gcc -g -O2 -Wall -o perceptron perceptron.c -lm +./perceptron + + +10. rewrite-cf-with-new-scores + +perceptron dumps its results in perceptron.scores. Great. How does +that help you? rewrite-cf-with-new-scores takes care of changing the +old configuration files to correspons with the new scores. The script +takes into account rules found in your configuration, so make sure +that the --cffile argument is right (it'll read this from masses.log +by default). The syntax is: + +rewrite-cf-with-new-scores --old 50_scores.cf --new perceptron.scores \ + --out 50_scores.new.cf -l masses.log -s 2 + +Make sure you don't forget the -s option. You need to tell it which +scoreset to update or it'll update set 0, which is not what you want +(unless you just did a set 0 run, of course). + +Note: the statistics in the new scores file are NOT updated. Just the +scores are. + +11. fp-fn-statistics + +This script calculates how good the scores are ata given threshold. It +returns the number of false positives, false negatives, true +positives, true negatives and a whole variety of fun statistics. + +./fp-fn-statistics -s --cffile + +fp-fn-statistics also generates a TCR which is essentially an overall +rating of how good the scores are. (This is only accurate when run on +a different corpus of mail than that with which the scores were +generated). TCR stands for "Total Cost Ratio". The higher the number, +the better the set of scores. + + +12. Submitting corpora for SpamAssassin + +If you want to contribute your mass-check logs to the SpamAssassin +rescoring process, please download the latest revision of SpamAssassin +from the subversion repository. See this page of the wiki: +http://wiki.spamassassin.org/DownloadFromSvn + +You will want to read CORPUS_POLICY and CORPUS_SUBMIT. We only do +large rescoring runs just before releases, so be sure to follow the +lists which will have more information and reminders on how to +participate. + +Please be sure your corpora are of high quality (everything must be +carefully checked to avoid misfilings). Also, we appreciate varied +sources of mail. + + +13. Other scripts + +Only a subset of the scripts used in rule development and scoring have +been documented here. Most of the others aren't really very +useful. You can examine the others by downloading the source from the +subversion repository: http://wiki.spamassassin.org/DownloadFromSvn. +Everything relating to rule QA and development is in the masses/ +sub-directory. + +The scripts presented here have had man pages written for them, and an +attempt has been made to standardize the options for ease of use. Many +of the others may require some reading of source to understand how +they work and what they do. + + +14. Frequently Asked Questions + +(Since this is the first version of this document, I'm guessing what +questions would otherwise be asked. So this isn't really a "Frequently +Asked Questions" list, but a "What did Duncan fail to address +elsewhere?" list.) + +Q. Why don't the scripts automatically guess which scoreset to use like +they do with --cffile? + +A. Firstly, mass-check does not know what scoreset +you are running. It could guess, but it probably shouldn't. Secondly, +the same masses.log can be used for multiple scoresets (a set 1 log +can be used to generate scores for sets 0 and 1, by stripping out net +rules etc.) + +Q. How can I determine how good the scoring system is? + +A. There is a series of scripts in the source directory (in +masses/tenpass/) designed to determine how accurate the perceptron is +by using "10-fold Cross Validation" (10fcv). Basically, the masses.log +is split into 10 "buckets" and each bucket is sequentially used to +validate against scores generated from the remaining 9. + + +15. Bugs, author, improvements, etc. + +SpamAssassin is written and maintained by a group of developers, whose +names can be found in the CREDITS file. + +If you have further questions about SpamAssassin or the rescoring +scripts, try the following: + +- Ask on one of the SpamAssassin mailing lists: + +http://www.spamassassin.org/lists.html + +- If you've found a bug, file a report: + +http://bugzilla.spamassassin.org/ + +- Also, check out our wiki: + +http://wiki.spamassassin.org/ Index: masses/runGA =================================================================== --- masses/runGA (revision 20231) +++ masses/runGA (working copy) @@ -1,47 +1,43 @@ #!/bin/sh SCORESET="0" +if [ "x$1" != "x" ] ; then + SCORESET=$1 +fi + NAME="set$SCORESET" +BASE="logs" -if [ ! -f "ORIG/ham-$NAME.log" -o ! -f "ORIG/spam-$NAME.log" ]; then +if [ ! -f "ORIG/masses-$NAME.log" ]; then echo "Couldn't find logs for $NAME" >&2 exit 1 fi -if [ "x$1" = "x" ]; then +if [ "x$2" = "x" ]; then echo "[Doing a scoreset $SCORESET score-generation run]" # Clean out old runs echo "[Cleaning up]" -rm -rf spam-validate.log nonspam-validate.log ham-validate.log spam.log nonspam.log ham.log NSBASE SPBASE tmp make.output freqs perceptron.scores \ - gen-$NAME.out gen-$NAME.scores gen-$NAME.validate + +rm -rf masses-validate.log masses.log $BASE tmp make.output freqs \ + perceptron.scores gen-$NAME.out gen-$NAME.scores gen-$NAME.validate make clean >/dev/null # Generate 90/10 split logs echo "[Generating 90/10 split ham]" -mkdir NSBASE SPBASE -cd NSBASE -../tenpass/split-log-into-buckets 10 < ../ORIG/ham-$NAME.log > /dev/null -cat split-[1-9].log > nonspam.log +mkdir $BASE +cd $BASE +../tenpass/split-log-into-buckets 10 < ../ORIG/masses-$NAME.log > /dev/null +cat split-[1-9].log > masses.log rm -f split-[1-9].log -mv split-10.log nonspam-validate.log +mv split-10.log masses-validate.log -echo "[Generating 90/10 split spam]" -cd ../SPBASE -../tenpass/split-log-into-buckets 10 < ../ORIG/spam-$NAME.log > /dev/null -cat split-[1-9].log > spam.log -rm -f split-[1-9].log -mv split-10.log spam-validate.log cd .. echo "[Setting up for gen run]" # Ok, setup for a run -ln -s SPBASE/spam.log . -ln -s NSBASE/nonspam.log . -ln -s NSBASE/nonspam.log ham.log -ln -s SPBASE/spam-validate.log . -ln -s NSBASE/nonspam-validate.log . -ln -s NSBASE/nonspam-validate.log ham-validate.log +ln -s $BASE/masses.log . +ln -s $BASE/masses-validate.log . # try to find number of processors numcpus=`cpucount 2>/dev/null || egrep -c '^processor\b' /proc/cpuinfo 2>/dev/null || echo 1` @@ -57,11 +53,12 @@ else +echo "Make sure 50_scores.cf has been replaced appropriately" + # This needs to have 50_scores.cf in place first ... echo "[gen validation results]" -./logs-to-c --spam=SPBASE/spam-validate.log \ - --nonspam=NSBASE/nonspam-validate.log \ - --count --cffile=../rules --scoreset=$SCORESET | tee gen-$NAME.validate +./fp-fn-statistics --logfile=BASE/masses-validate.log \ + --cffile=../rules --scoreset=$SCORESET | tee gen-$NAME.validate echo "[STATISTICS file generation]" ./mk-baseline-results $SCORESET | tee gen-$NAME.statistics Index: masses/lint-rules-from-freqs =================================================================== --- masses/lint-rules-from-freqs (revision 20231) +++ masses/lint-rules-from-freqs (working copy) @@ -16,124 +16,214 @@ # limitations under the License. # +=head1 NAME + +lint-rules-from-freqs - Try to find problems with SpamAssassin rules + +=head1 SYNOPSIS + +lint-rules-from-freqs [options] + + Options: + -c,--cffile=path Use path as the rules directory + -s,--scoreset=n Use scoreset n + -l,--logfile=file Read in file instead of masses.log + -f Also take into account false positives/negatives + +=head1 DESCRIPTION + +This script analyzes SpamAssassin tests, based on the hit frequencies +and S/O ratios from a mass-check log (masses.log). This script can +also optionally take into account the false positive/negative +frequencies. + +The script first uses the SpamAssassin rules parser to report on any +illegal syntax. Then it checks the rules match frequencies from the +mass-check log in order to determine how effective the rule is. + +=head1 BUGS + +Please report bugs to http://bugzilla.spamassassin.org/ + +=head1 SEE ALSO + +L, L, L + +=cut + + +use FindBin; +use lib "$FindBin::Bin/../lib"; +use Mail::SpamAssassin::Masses; +use Mail::SpamAssassin; +use Getopt::Long qw(:config bundling auto_help); +use strict; +use warnings; + # any tests that get less than this % of matches on *both* spam or nonspam, are # reported. my $LOW_MATCHES_PERCENT = 0.03; -my $scoreset = 0; -sub usage { - die " -lint-rules-from-freqs: perform 'lint' testing on SpamAssassin rules and scores +use vars qw($opt_c $opt_l $opt_s $opt_f $opt_p); -usage: ./lint-rules-from-freqs [-f falsefreqs] < freqs > badtests +GetOptions("c|cffile=s@" => \$opt_c, + "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode) + "l|logfile=s" => \$opt_l, + "f|falses" => \$opt_f); -This analyzes SpamAssassin tests, based on the hit frequencies and S/O ratios -from a mass-check logfile pair. -The 'freqs' argument is the frequency of hits in all messages ('hit-frequencies --x -p' output). +$opt_s = 0 unless defined $opt_s; +$opt_l ||= "masses.log"; -The 'falsefreqs' argument is frequencies of hits in false-positives and -false-negatives only ('hit-frequencies -x -p -f' output). +if (!$opt_c || !scalar(@$opt_c)) { + # Try to read this in from the log, if possible + open IN, $opt_l or die "Can't open $opt_l: $!"; + my $files = 0; # are we in the files section? + while() { + if (!$files) { + if (/^\# SVN revision:/) { + $opt_c = [ "$FindBin::Bin/../rules" ]; + last; + } elsif (/^\# Using configuration:$/) { + $files = 1; + } + } elsif (/^\#\s+(.*)\s*$/) { + push (@$opt_c, $1); + } else { + # All done! + last; + } + } -"; -} + if (!defined $opt_c) { + $opt_c = [ "$FindBin::Bin/../rules" ]; + } -my $opt_falsefreqs; -while ($#ARGV >= 0) { - $_ = shift @ARGV; - if (/^-f/) { $_ = shift @ARGV; $opt_falsefreqs = $_; } - elsif (/^-s/) { $_ = shift @ARGV; $scoreset = $_; } - else { usage(); } + foreach my $file (@$opt_c) { + die "Can't read $file" unless -r $file; + } } print "BAD TESTS REPORT\n"; -readrules(); -print "\n" .((scalar keys %rulefile) + 1). " rules found.\n"; +# First, do a --lint + print "\nRule file syntax issues:\n\n"; -lintrules(); -if ($opt_falsefreqs) { - open (FALSE, "<$opt_falsefreqs"); - while () { - if (!/^\s*([\d\.]+)/) { - my ($overall, $spam, $nons, $so, $score, $name) = split (' '); - next unless ($name =~ /\S/); - $falsefreqs_spam{$name} = $spam; - $falsefreqs_nons{$name} = $nons; - $falsefreqs_so{$name} = $so; +{ + local (*STDERR) = \*STDOUT; # Get lint errors on STDOUT + + # Read the config ourselves... + + # Read init.pre from each directory, then glob for the rest. + + my $cf_txt = ''; + my @files; + my @dirs; + foreach my $file (@$opt_c) { + if (-d $file) { + if (-r "$file/init.pre") { + push @files, "$file/init.pre"; + } + push @dirs, $file; } + else { + push @files, $file; + } } - close FALSE; -} + foreach my $dir (@dirs) { + my @cfs = glob("$dir/*.cf"); + push @files, grep { -r $_ } @cfs; + } -while (<>) { - if (!/^\s*([\d\.]+)/) { - $output{'a_header'} = $_; next; + foreach my $file (@files) { + if (-r $file) { + open IN, $file; + $cf_txt .= "file start $file\n"; + $cf_txt .= join('', ); + $cf_txt .= "\nfile end $file\n"; + close IN; + } } + my $spamtest = new Mail::SpamAssassin({config_text => $cf_txt}); + + $spamtest->lint_rules(); +} + + +# Next, check for other stuff +my $masses = Mail::SpamAssassin::Masses->new({rulesdir => $opt_c, + scoreset => $opt_s, #,, + falses => $opt_f, + logfile => $opt_l}); + +$masses->readlogs(); +$masses->do_statistics(); + +my $rules = $masses->get_rules_array(); + + +my %output; + +foreach my $rule (@$rules) { + my $badrule; - my ($overall, $spam, $nons, $so, $score, $name) = split (' '); - next unless ($name =~ /\S/); - my $ffspam = $falsefreqs_spam{$name}; - my $ffnons = $falsefreqs_nons{$name}; - my $ffso = $falsefreqs_so{$name}; + next if ($rule->{tflags} =~ /\bnet\b/ && ($opt_s % 2) == 0); + next if ($rule->{tflags} =~ /\buserconf\b/); - my $tf = $tflags{$name}; - next if ($tf =~ /net/ && ($scoreset % 2) == 0); - next if ($tf =~ /userconf/); + if ($rule->{freq_spam} == 0 && $rule->{freq_ham} == 0) { # sanity! - if ($overall == 0.0 && $spam == 0.0 && $nons == 0.0) { # sanity! $badrule = 'no matches'; } else { - if ($score < 0.0) { + if ($rule->{score} < 0.0) { # negative score with more spams than nonspams? bad rule. - if ($tf !~ /nice/ && $so > 0.5 && $score < 0.5) { + if (!$rule->{isnice} && $rule->{soratio} > 0.5 && $rule->{score} < 0.5) { $badrule = 'non-nice but -ve score'; } - - if ($tf =~ /nice/ && $so > 0.5 && $score < 0.5) { - if ($ffso < 0.5) { + if ($rule->{isnice} && $rule->{soratio} > 0.5 && $rule->{score} < 0.5) { + if ($opt_f && $rule->{freq_fn} < $rule->{freq_fp}) { $badrule = 'fn'; - } else { - # ignore, the FNs are overridden by other tests so it doesn't - # affect the overall results. } + # else { + # ignore, the FNs are overridden by other tests so it doesn't + # affect the overall results. + # } } # low number of matches overall - if ($nons < $LOW_MATCHES_PERCENT) + if ($rule->{ham_percent} < $LOW_MATCHES_PERCENT) { $badrule ||= ''; $badrule .= ', low matches'; } - } elsif ($score > 0.0) { + } elsif ($rule->{score} > 0.0) { # positive score with more nonspams than spams? bad. - if ($tf =~ /nice/ && $so < 0.5 && $score > 0.5) { + if ($rule->{isnice} && $rule->{soratio} < 0.5 && $rule->{score} > 0.5) { $badrule = 'nice but +ve score'; } - - if ($tf !~ /nice/ && $so < 0.5 && $score > 0.5) { - if ($ffso > 0.5) { + + if (!$rule->{isnice} && $rule->{soratio} < 0.5 && $rule->{score} > 0.5) { + if ($opt_f && $rule->{freq_fp} > $rule->{freq_fn}) { $badrule = 'fp'; - } else { - # ignore, the FPs are overridden by other tests so it doesn't - # affect the overall results. } + # else { + # ignore, the FPs are overridden by other tests so it doesn't + # affect the overall results. + # } } - + # low number of matches overall - if ($spam < $LOW_MATCHES_PERCENT) + if ($rule->{spam_percent} < $LOW_MATCHES_PERCENT) { $badrule ||= ''; $badrule .= ', low matches'; } - - } elsif ($score == 0.0) { + + } elsif ($rule->{score} == 0.0) { $badrule = 'score is 0'; } } - + if (defined $badrule) { - $badrule =~ s/^, //; chomp; - $output{$badrule} .= $_ . " ($badrule)\n"; + $badrule =~ s/^, //; + $output{$badrule} .= $rule->{name} . " ($badrule)\n"; } } @@ -156,182 +246,3 @@ exit; -sub concat_rule_lang { - my $rule = shift; - my $lang = shift; - - if (defined $lang && $lang ne '') { - return "[$lang]_$rule"; - } else { - return $rule; - } -} - -# note: do not use parse-rules-for-masses here, we need to do linting instead -# of your average parse -sub readrules { - my @files = <../rules/[0-9]*.cf>; - my $file; - %rulesfound = (); - %langs = (); - foreach $file (@files) { - open (IN, "<$file"); - while () { - s/#.*$//g; s/^\s+//; s/\s+$//; next if /^$/; - - # make all the foo-bar stuff foo_bar - 1 while s/^(\S+)-/\1_/g; - 1 while s/^(lang\s+\S+\s+\S+)-/\1_/g; - - my $lang = ''; - if (s/^lang\s+(\S+)\s+//) { - $lang = $1; $langs{$1} = 1; - } - - if (/^(header|rawbody|body|full|uri|meta)\s+(\S+)\s+/) { - $rulesfound{$2} = 1; - $rulefile{$2} ||= $file; - $scorefile{$1} = $file; - $score{$2} ||= 1.0; - $tflags{$2} ||= ''; - $descfile{$2} ||= $file; # a rule with no score or desc is OK - $description{$2}->{$lang} = undef; - - if (/^body\s+\S+\s+eval:/) { - # ignored - } elsif (/^body\s+\S+\s+(.*)$/) { - my $re = $1; - - # If there's a ( in a rule where it should be (?:, flag it. - # but ignore [abc(] ... - if ($re =~ /[^\\]\([^\?]/ && $re !~ /\[[^\]]*[^\\]\(/) { - print "warning: non-(?:...) capture in regexp in $file: $_\n"; - } - if ($re =~ /\.[\*\+]/) { - print "warning: .* in regexp in $file: $_\n"; - } - if ($re =~ /[^\\]\{(\d*),?(\d*?)\}/) { - if ($1 > 120 || $2 > 120) { - print "warning: long .{n} in regexp in $file: $_\n"; - } - } - } - - } elsif (/^describe\s+(\S+)\s+(.*?)\s*$/) { - $rulesfound{$1} = 1; - $descfile{concat_rule_lang ($1, $lang)} ||= $file; - $descfile{$1} ||= $file; - $description{$1}->{$lang} = $2; - } elsif (/^tflags\s+(\S+)\s+(.+)$/) { - $rulesfound{$1} = 1; - $tflags{$1} = $2; - $tflagsfile{concat_rule_lang ($1, $lang)} = $file; - $tflagsfile{$1} = $file; - } elsif (/^score\s+(\S+)\s+(.+)$/) { - $rulesfound{$1} = 1; - $scorefile{concat_rule_lang ($1, $lang)} = $file; - $scorefile{$1} = $file; - $score{$1} = $2; - } elsif (/^(clear_report_template|clear_spamtrap_template|report|spamtrap| - clear_terse_report_template|terse_report| - required_score|ok_locales|ok_languages|test|lang| - spamphrase|whitelist_from|require_version| - clear_unsafe_report_template|unsafe_report| - (?:bayes_)?auto_learn_threshold_nonspam|(?:bayes_)?auto_learn_threshold_spam| - (?:bayes_)?auto_learn - )/x) { - next; - } else { - print "warning: unknown rule in $file: $_\n"; - } - } - close IN; - } - @langsfound = sort keys %langs; - @rulesfound = sort keys %rulesfound; -} - -sub lintrules { - my %possible_renames = (); - - foreach my $rule (@rulesfound) { - my $match = $rule; - $match =~ s/_\d+[^_]+$//gs; # trim e.g. "_20K" - $match =~ s/[^A-Z]+//gs; # trim numbers etc. - - if (defined ($rulefile{$rule}) && $possible_renames{$match} !~ / \Q$rule\E\b/) { - $possible_renames{$match} .= " ".$rule; - } - $possible_rename_matches{$rule} = $match; - } - - foreach my $lang ('', @langsfound) { - foreach my $baserule (@rulesfound) { - next if ( $baserule =~ /^__/ || $baserule =~ /^T_/ ); - - my $rule = concat_rule_lang ($baserule, $lang); - my $f = $descfile{$rule}; - my $warned = ''; - - if (defined $f && !defined ($rulefile{$rule}) - && !defined ($rulefile{$baserule})) - { - print "warning: $baserule has description, but no rule: $f\n"; - $warned .= ' lamedesc'; - } - - # Check our convention for rule length - if ( (($lang ne '' && defined($rulefile{$rule})) || ($lang eq '' && defined ($rulefile{$baserule}))) && length $baserule > 22 ) { - print "warning: $baserule has a name longer than 22 chars: $f\n"; - } - # Check our convention for rule length - if ( (($lang ne '' && defined($rulefile{$rule})) || ($lang eq '' && defined ($rulefile{$baserule}))) && defined $description{$baserule}->{$lang} && length $description{$baserule}->{$lang} > 50 ) { - print "warning: $baserule has a description longer than 50 chars: $f\n"; - } - - # lang rule trumps normal rule - $f = $rulefile{$rule} || $rulefile{$baserule}; - # if the rule exists, and the language/rule description doesn't exist ... - if ( defined $f && !defined $description{$baserule}->{$lang} ) - { - print "warning: $baserule exists, ",( $lang ne '' ? "lang $lang, " : "" ),"but has no description: $f\n"; - $warned .= ' lamedesc'; - } - - - $f = $scorefile{$rule}; - if (defined $f && !defined ($rulefile{$rule}) - && !defined ($rulefile{$baserule})) - { - print "warning: $baserule has score, but no rule: $f\n"; - $warned .= ' lamescore'; - } - - my $r = $possible_rename_matches{$rule}; - if ($warned ne '' && defined $r) { - my @matches = split (' ', $possible_renames{$r}); - if (scalar @matches != 0) { - my $text = ''; - - # now try and figure out "nearby" rules with no description/score - foreach my $baser (@matches) { - my $blang; - if ($descfile{$rule} =~ /text_(\S\S)\./) { - $blang = $1; - } - my $r = concat_rule_lang ($baser, $blang); - #warn "$r $descfile{$r} $descfile{$baser}"; - next if ($warned =~ /lamedesc/ && (defined $descfile{$r})); - next if ($warned =~ /lamescore/ && (defined $scorefile{$r})); - $text .= " $baser"; - } - - if ($text ne '') { - print "warning: (possible renamed rule? $text)\n"; - } - } - } - } - } -} - Index: masses/Makefile =================================================================== --- masses/Makefile (revision 20231) +++ masses/Makefile (working copy) @@ -3,34 +3,27 @@ LDFLAGS= -lm # What rule scoreset are we using? -SCORESET = 0 +SCORESET = 3 +LOGFILE = masses.log #### Should be no need to modify below this line all: badrules perceptron perceptron: perceptron.o - $(CC) -o perceptron perceptron.o $(LDFLAGS) + $(CC) -o perceptron perceptron.o $(LDFLAGS) -perceptron.o: tmp/rules.pl tmp/tests.h tmp/scores.h +perceptron.o: tmp/tests.h $(CC) $(CFLAGS) -c -o perceptron.o perceptron.c -tmp/rules.pl: tmp/.created parse-rules-for-masses - perl parse-rules-for-masses -d ../rules -s $(SCORESET) +tmp/tests.h: tmp/.created logs-to-c + perl logs-to-c --scoreset=$(SCORESET) --logfile=$(LOGFILE) -tmp/tests.h: tmp/.created tmp/ranges.data logs-to-c - perl logs-to-c --scoreset=$(SCORESET) +freqs: masses.log + perl hit-frequencies -x -p -s $(SCORESET) --logfile=$(LOGFILE) > freqs -tmp/scores.h: tmp/tests.h - -tmp/ranges.data: tmp/.created freqs score-ranges-from-freqs - perl score-ranges-from-freqs ../rules $(SCORESET) < freqs - -freqs: spam.log ham.log - perl hit-frequencies -x -p -s $(SCORESET) > freqs - badrules: freqs - perl lint-rules-from-freqs < freqs > badrules + perl lint-rules-from-freqs -s $(SCORESET) --logfile=$(LOGFILE) > badrules tmp/.created: -mkdir tmp Index: masses/mass-check =================================================================== --- masses/mass-check (revision 20231) +++ masses/mass-check (working copy) @@ -16,144 +16,251 @@ # limitations under the License. # -sub usage { - die < - -j=jobs specify the number of processes to run simultaneously - --net turn on network checks! - --mid report Message-ID from each message - --debug report debugging information - --progress show progress updates during check - --rewrite=OUT save rewritten message to OUT (default is /tmp/out) - --showdots print a dot for each scanned message - --rules=RE Only test rules matching the given regexp RE - --restart=N restart all of the children after processing N messages - --deencap=RE Extract SpamAssassin-encapsulated spam mails only if they - were encapsulated by servers matching the regexp RE - (default = extract all SpamAssassin-encapsulated mails) - - log options - -o write all logs to stdout - --loghits log the text hit for patterns (useful for debugging) - --loguris log the URIs found - --hamlog=log use as ham log ('ham.log' is default) - --spamlog=log use as spam log ('spam.log' is default) - - message selection options - -n no date sorting or spam/ham interleaving - --after=N only test mails received after time_t N (negative values - are an offset from current time, e.g. -86400 = last day) - or after date as parsed by Time::ParseDate (e.g. '-6 months') - --before=N same as --after, except received times are before time_t N - --all don't skip big messages - --head=N only check first N ham and N spam (N messages if -n used) - --tail=N only check last N ham and N spam (N messages if -n used) - - simple target options (implies -o and no ham/spam classification) - --dir subsequent targets are directories - --file subsequent targets are files in RFC 822 format - --mbox subsequent targets are mbox files - --mbx subsequent targets are mbx files - - Just left over functions we should remove at some point: - --bayes report score from Bayesian classifier - - non-option arguments are used as target names (mail files and folders), - the target format is: :: - is "spam" or "ham" - is "dir", "file", "mbx", or "mbox" - is a file or directory name. globbing of ~ and * is supported +=head1 NAME -ENDOFUSAGE -} +mass-check - Generates SpamAssassin scores and results for large +amounts of mail +=head1 SYNOPSIS + + mass-check [options] class:format:location ... + mass-check [options] {--dir | --file | --mbox} target ... + mass-check [options] -f file + + Options: + -f=file read list of targets from + -j=jobs specify the number of processes to run simultaneously + --net turn on network checks! + --mid report Message-ID from each message + --debug report debugging information + --progress show progress updates during check + --rewrite=OUT save rewritten message to OUT (default is /tmp/out) + --showdots print a dot for each scanned message + --rules=RE Only test rules matching the given regexp RE + --restart=N restart all of the children after processing N messages + + SpamAssassin options + -c=dir set configuration/rules directory + -p=file set user preferences file (default: none) + -s=dir set site rules configuration directory + -u=dir set user-state directory + --dist assumes the script is being run from the masses/ dir of + the unpacked tarball, and makes appropriate guesses for + -p and -c + --deencap=RE Extract SpamAssassin-encapsulated spam mails only if they + were encapsulated by servers matching the regexp RE + (default = extract all SpamAssassin-encapsulated mails) + + log options + -o write all logs to stdout + --loghits log the text hit for patterns (useful for debugging) + --loguris log the URIs found + --log=file log to (masses.log is default) + + message selection options + -n no date sorting or spam/ham interleaving + --after=N only test mails received after time_t N (negative values + are an offset from current time, e.g. -86400 = last day) + or after date as parsed by Time::ParseDate (e.g. '-6 months') + --before=N same as --after, except received times are before time_t N + --all don't skip big messages + --head=N only check first N ham and N spam (N messages if -n used) + --tail=N only check last N ham and N spam (N messages if -n used) + + simple target options (implies -o and no ham/spam classification) + --dir subsequent targets are directories + --file subsequent targets are files in RFC 822 format + --mbox subsequent targets are mbox files + --mbx subsequent targets are mbx files + + Just left over functions we should remove at some point: + --bayes report score from Bayesian classifier + --hamlog=log use as ham log ('ham.log' is default) + --spamlog=log use as spam log ('spam.log' is default) + +=head1 DESCRIPTION + +B is designed to assist with rule development and +generation of SpamAssassin scored. It reads in mail from the +location(s) specified on the command line (in the first form above), +given in the form I, where I is either +"spam" or "ham" (non-spam), I is one of "dir" (Maildirs, MH, +etc), "file", "mbox" (mboxes can be gzipped) or "mbx". + +B will analyze each message using SpamAssassin and +generate one-line of output per message, (by default to masses.log) in +the following format: + + {s|h} {s|h} score filename tests-hit + +The first field is the message's class as given on the command line +(ham or spam). The second is the message's class as determined by +SpamAssassin. The third is the message's score, as determined by +SpamAssassin. The fourth field contains the message's filename; for +mboxes, this contains the filename and the byte offset from the +beginning of the file separated by a period. The last field contains a +list of all the tests the message hit separated by commas. + +If you want to run this on the currently installed version of +SpamAssassin's rules for sitewide use, make sure your user_prefs file +contains no rules. + +=head1 BUGS + +Please report bugs to http://bugzilla.spamassassin.org/ + +=head1 SEE ALSO + +L, L, L, +L + +=cut + ########################################################################### -use vars qw($opt_c $opt_p $opt_f $opt_j $opt_n $opt_o $opt_all $opt_bayes - $opt_debug $opt_format $opt_hamlog $opt_head $opt_loghits - $opt_mid $opt_mh $opt_ms $opt_net $opt_nosort $opt_progress - $opt_showdots $opt_spamlog $opt_tail $opt_rules $opt_restart - $opt_loguris $opt_after $opt_before $opt_rewrite $opt_deencap); +use vars qw($opt_c $opt_p $opt_f $opt_j $opt_n $opt_o $opt_all + $opt_bayes $opt_before $opt_debug $opt_dist $opt_format + $opt_hamlog $opt_head $opt_log $opt_loghits $opt_mid + $opt_mh $opt_ms $opt_net $opt_nosort $opt_p $opt_progress + $opt_s $opt_showdots $opt_spamlog $opt_tail $opt_rules + $opt_restart $opt_loguris $opt_after $opt_rewrite $opt_u + $opt_deencap); use FindBin; use lib "$FindBin::Bin/../lib"; eval "use bytes"; use Mail::SpamAssassin::ArchiveIterator; use Mail::SpamAssassin; -use Getopt::Long; +use Getopt::Long qw(:config bundling auto_help); +use Pod::Usage; use POSIX qw(strftime); use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; }; +use strict; # Why wasn't this on? use Config; # default settings -$opt_c = "$FindBin::Bin/../rules"; -$opt_p = "$FindBin::Bin/spamassassin"; + $opt_j = 1; $opt_net = 0; -$opt_hamlog = "ham.log"; -$opt_spamlog = "spam.log"; +$opt_log = "masses.log"; -GetOptions("c=s", "p=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug", - "hamlog=s", "head=i", "loghits", "mh", "mid", "ms", "net", - "progress", "rewrite:s", "showdots", "spamlog=s", "tail=i", - "rules=s", "restart=i", "after=s", "before=s", "loguris", "deencap=s", +GetOptions("c|cffile=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug", + "deencap=s", "dist!", "hamlog=s", "head=i", "log=s", + "loghits", "mh", "mid", "ms", "net", "p=s", "progress", + "rewrite:s", "s=s", "showdots", "spamlog=s", "tail=i", + "rules=s", "restart=i", "u=s", "after=s", "loguris", "dir" => sub { $opt_format = "dir"; }, - "file" => sub { $opt_format = "file"; }, + "file" => sub {$opt_format = "file"; }, "mbox" => sub { $opt_format = "mbox"; }, "mbx" => sub { $opt_format = "mbx"; }, - '<>' => \&target) or usage(); + '<>' => \&target); +if ($opt_hamlog || $opt_spamlog) { # Old style logging + $opt_hamlog ||= "ham.log"; + $opt_spamlog ||= "spam.log"; +} + +my @targets; + if ($opt_f) { open(F, $opt_f) || die $!; push(@targets, map { chomp; $_ } ); close(F); } -if (scalar @targets == 0) { usage(); } +if (scalar @targets == 0) { pod2usage("No target defined!"); } -#if ($opt_ms) { -#find_missed($opt_spamlog); -#} -#elsif ($opt_mh) { -#find_missed($opt_hamlog); -#} +# Auto-detect --dist option +if (!defined $opt_dist) { + if (-f "$FindBin::Bin/../spamassassin.raw") { + warn "Automatically using --dist. Assuming you are running from the unpacked tarball. Use --no-dist to override."; + $opt_dist = 1; + } +} -$spamtest = new Mail::SpamAssassin ({ - 'debug' => $opt_debug, - 'rules_filename' => $opt_c, - 'userprefs_filename' => "$opt_p/user_prefs", - 'site_rules_filename' => "$opt_p/local.cf", - 'userstate_dir' => "$opt_p", - 'save_pattern_hits' => $opt_loghits, - 'dont_copy_prefs' => 1, - 'local_tests_only' => $opt_net ? 0 : 1, - 'only_these_rules' => $opt_rules, - 'ignore_safety_expire_timeout' => 1, - PREFIX => '', - DEF_RULES_DIR => $opt_c, - LOCAL_RULES_DIR => '', -}); +my $local_rules_dir; +if ($opt_dist) { # Set defaults + $opt_c ||= "$FindBin::Bin/../rules"; + $opt_p ||= "$FindBin::Bin/mass-check.cf"; + $opt_u ||= "$FindBin::Bin/spamassassin"; + $opt_s ||= "$FindBin::Bin/spamassassin"; + $local_rules_dir = ''; +} +else { + if(!$opt_u) { + # Assuming this is OK, since mass-check isnt supported on windows, is it? + # Also, should there be some check to make sure that previous mass-check stuff isn't in there? + # AFAICT, there isn't otherwise.... + if ( -d "${ENV{HOME}}/.spamassassin" ) { + $opt_u = "${ENV{HOME}}/.spamassassin/mass-check"; + warn "$opt_u already exists -- may contain files that will effect the results" if (-d $opt_u); + mkdir $opt_u, 0700 if (! -d $opt_u); + } + } + +# Leave the rest to SA, we'll get it afterwards + +} + + +$opt_s =~ s/~/$ENV{HOME}/ if $opt_s; +$opt_c =~ s/~/$ENV{HOME}/ if $opt_c; +$opt_p =~ s/~/$ENV{HOME}/ if $opt_p; +$opt_u =~ s/~/$ENV{HOME}/ if $opt_u; + + +my $spamtest = new Mail::SpamAssassin ({ + 'debug' => $opt_debug, + 'rules_filename' => $opt_c, + 'userprefs_filename' => $opt_p, + 'site_rules_filename' => $opt_s, + 'userstate_dir' => $opt_u, + 'save_pattern_hits' => $opt_loghits, + 'dont_copy_prefs' => 1, + 'local_tests_only' => $opt_net ? 0 : 1, + 'only_these_rules' => $opt_rules, + 'ignore_safety_expire_timeout' => 1, + DEF_RULES_DIR => $opt_c, + LOCAL_RULES_DIR => $local_rules_dir, + }); + $spamtest->compile_now(1); -$spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf"); +if ($opt_dist) { + $spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf"); +} my $who = `id -un 2>/dev/null`; chomp $who; my $where = `uname -n 2>/dev/null`; chomp $where; my $when = `date -u`; chomp $when; -my $revision = "unknown"; -if (open(TESTING, "$opt_c/70_testing.cf")) { - chomp($revision = ); - $revision =~ s/.*\$Rev:\s*(\S+).*/$1/; - close(TESTING); +my $revision; + +if ($opt_dist) { + my $rev = "unknown"; + if (open(TESTING, "$opt_c/70_testing.cf")) { + chomp($rev = ); + $rev =~ s/.*\$Rev:\s*(\S+).*/$1/; + close(TESTING); + } + $revision = "SVN revision: $rev"; } +else { + $revision = "Local"; +} + my $log_header = "# mass-check results from $who\@$where, on $when\n" . "# M:SA version ".$spamtest->Version()."\n" . - "# SVN revision: $revision\n" . + "# $revision\n" . "# Perl version: $] on $Config{archname}\n"; + +if (!$opt_dist) { + my @paths = ( $spamtest->{rules_filename}, $spamtest->{site_rules_filename}, $spamtest->{userprefs_filename} ); + $log_header .= "# Using configuration:\n"; + foreach my $file (@paths) { + $log_header .= "# $file\n"; + } +} + my $host = $ENV{'HOSTNAME'} || $ENV{'HOST'} || `hostname` || 'localhost'; chomp $host; @@ -222,7 +329,7 @@ autoflush STDOUT 1; print STDOUT $log_header; } - else { + elsif ($opt_hamlog || $opt_spamlog) { open(HAM, "> $opt_hamlog"); open(SPAM, "> $opt_spamlog"); autoflush HAM 1; @@ -230,6 +337,11 @@ print HAM $log_header; print SPAM $log_header; } + else { + open(OUT, "> $opt_log"); + autoflush OUT 1; + print OUT $log_header; + } $init_results = 1; } @@ -239,25 +351,36 @@ # don't open results files until we get here to avoid overwriting files &init_results if !$init_results; - if ($class eq "s") { - if ($opt_o) { print STDOUT $result; } else { print SPAM $result; } - $spam_count++; + if ($opt_o) { + print STDOUT $result; } - elsif ($class eq "h") { - if ($opt_o) { print STDOUT $result; } else { print HAM $result; } - $ham_count++; + elsif ($opt_spamlog || $opt_hamlog) { + if ($class eq "s") { + print SPAM $result; + } else { + print HAM $result; + } } + else { + print OUT $result; + } $total_count++; #warn ">> result: $total_count $class $time\n"; if ($opt_progress) { + if ($class eq "s") { + $spam_count++; + } + else { + $ham_count++; + } progress($time); } } sub wanted { - my (undef, $id, $time, $dataref) = @_; + my ($class, $id, $time, $dataref) = @_; my $out; my $ma = $spamtest->parse($dataref, 1); @@ -308,18 +431,22 @@ push(@extra, "mid=$mid"); } - my $yorn; + my $result; my $score; my $tests; my $extra; if ($opt_loguris) { - $yorn = '.'; + $result = '.'; $score = 0; $tests = join(" ", sort @uris); $extra = ''; } else { - $yorn = $status->is_spam() ? 'Y' : '.'; + if ($status->is_spam()) { + $result = "s"; + } else { + $result = "h"; + } $score = $status->get_score(); $tests = join(",", sort(grep(length,$status->get_names_of_tests_hit(),$status->get_names_of_subtests_hit()))); $extra = join(",", @extra); @@ -333,7 +460,7 @@ $id =~ s/\s/_/g; - $out .= sprintf("%s %2d %s %s %s\n", $yorn, $score, $id, $tests, $extra); + $out .= sprintf("%s %s %05.2f %s %s %s\n", $class, $result, $score, $id, $tests, $extra); if ($tests =~ /MICROSOFT_EXECUTABLE|MIME_SUSPECT_NAME/) { $out .= logkilled($ma, $id, "possible virus"); Index: masses/mk-baseline-results =================================================================== --- masses/mk-baseline-results (revision 20231) +++ masses/mk-baseline-results (working copy) @@ -10,7 +10,7 @@ echo "Classification success on test corpora, at default threshold:" echo -./logs-to-c --spam=spam-validate.log --nonspam=nonspam-validate.log --threshold 5 --count --scoreset=$SCORESET | sed -e 's/^Reading.*//' -e '/^$/d' +./fp-fn-statistics --logfile=masses-validate.log --threshold 5 --scoreset=$SCORESET echo echo "Results on test corpora at various alternative thresholds:" @@ -18,7 +18,7 @@ # list a wide range of thresholds, so that we can make graphs later ;) for thresh in -4 -3 -2 -1 0 1 2 3 4 4.5 5.5 6 6.5 7 8 9 10 12 15 17 20 ; do - ./logs-to-c --spam=spam-validate.log --nonspam=nonspam-validate.log --threshold $thresh --count --scoreset=$SCORESET | sed -e 's/^Reading.*//' -e '/^$/d' + ./fp-fn-statistics --logfile=masses-validate.log --threshold $thresh --scoreset=$SCORESET echo done Index: masses/README =================================================================== --- masses/README (revision 20231) +++ masses/README (working copy) @@ -33,8 +33,6 @@ See the CORPUS_POLICY file for more details. - - HOW TO SUBMIT RESULTS BACK TO US -------------------------------- @@ -52,11 +50,11 @@ This script is used to perform "mass checks" of a set of mailboxes, Cyrus folders, and/or MH mail spools. It generates summary lines like this: - Y 7 /home/jm/Mail/Sapm/1382 SUBJ_ALL_CAPS,SUPERLONG_LINE,SUBJ_FULL_OF_8BITS + s s 07.22 /home/jm/Mail/Sapm/1382 SUBJ_ALL_CAPS,SUPERLONG_LINE,SUBJ_FULL_OF_8BITS or for mailboxes, - . 1 /path/to/mbox:<5.1.0.14.2.20011004073932.05f4fd28@localhost> TRACKER_ID,BALANCE_FOR_LONG + h h 01.32 /path/to/mbox:<5.1.0.14.2.20011004073932.05f4fd28@localhost> TRACKER_ID,BALANCE_FOR_LONG listing the path to the message or its message ID, its score, and the tests that triggered on that mail. @@ -65,23 +63,23 @@ get good hits with few false positives, etc., and re-score the tests to optimise the ratio. - This script relies on the spamassassin distribution directory living in "..". + If given the --dist option, this script relies on the spamassassin + distribution directory living in "..". If this script is not in the + distribution directory, it will generate logs based on the site-wide + rules, as well as personal rules. - logs-to-c : - Takes the "spam.log" and "nonspam.log" files and converts them into C - source files and simplified data files for use by the C score optimization - algorithm. (Called by "make" when you build the perceptron, so generally - you won't need to run it yourself.) + Takes the "masses.log" file and converts them into C source files + and simplified data files for use by the C score optimization + algorithm. (Called by "make" when you build the perceptron, so + generally you won't need to run it yourself.) - hit-frequencies : Analyses the log files and computes how often each test hits, overall, for spam mails and for non-spam. - mk-baseline-results : Compute results for the baseline scores (read from ../rules/*). If you @@ -91,7 +89,6 @@ It will output statistics on the current ruleset to ../rules/STATISTICS.txt, suitable for a release build of SpamAssassin. - perceptron.c : Perceptron learner by Henry Stern. See "README.perceptron" for details. Index: masses/fp-fn-statistics =================================================================== --- masses/fp-fn-statistics (revision 20231) +++ masses/fp-fn-statistics (working copy) @@ -1,3 +1,191 @@ -#!/bin/sh +#!/usr/bin/perl -w +# +# <@LICENSE> +# Copyright 2004 Apache Software Foundation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# -exec ./logs-to-c --count $* +=head1 NAME + +fp-fn-statistics - Display statistics about the quality of scores + +=head1 SYNOPSIS + +fp-fn-statistics [options] + + Options: + -c,--cffile=path Use path as the rules directory + -s,--scoreset=n Use scoreset n + -l,--logfile=file Read in file instead of masses.log + -t,--threshold=n Use a spam/ham threshold of n (default: 5) + --lambda=n Use a lambda value of n + +=head1 DESCRIPTION + +B first calculates the score each message from a +masses.log would have under a new set of scores. It then aggregates +the number of messages correctly and incorrectly found as spam and +ham, and their average scores. + +In addition, B determines the "Total Cost Ratio" as +a result of the false positives and negatives mentioned above. This +calculation takes into the value of lambda, which represents the cost +of recovering a false positive, where 1 indicates a message is tagged +only, 9 means the message is mailed back to sender asking for a token +(TMDA style) and 999 means a message is delted. The default, 5, +represents the message being moved to an infrequently read folder. + +=cut + +use FindBin; +use lib "$FindBin::Bin/../lib"; +use Mail::SpamAssassin::Masses; +use Getopt::Long qw(:config bundling auto_help); +use Pod::Usage; +use strict; +use warnings; + +use vars qw{$opt_c $opt_l $opt_s $opt_t $opt_lambda}; + +GetOptions("c|cffile=s@" => \$opt_c, + "l|logfile=s" => \$opt_l, + "s|scoreset=i" => \$opt_s, + "t|threshold=f" => \$opt_t, + "lambda" => \$opt_lambda); + +$opt_l ||= "masses.log"; + +if (!$opt_c || !scalar(@$opt_c)) { + # Try to read this in from the log, if possible + open IN, $opt_l or die "Can't open $opt_l: $!"; + my $files = 0; # are we in the files section? + while() { + if (!$files) { + if (/^\# SVN revision:/) { + $opt_c = [ "$FindBin::Bin/../rules" ]; + last; + } elsif (/^\# Using configuration:$/) { + $files = 1; + } + } elsif (/^\#\s+(.*)\s*$/) { + push (@$opt_c, $1); + } else { + # All done! + last; + } + } + + if (!defined $opt_c) { + $opt_c = [ "$FindBin::Bin/../rules" ]; + } + + foreach my $file (@$opt_c) { + die "Can't read $file" unless -r $file; + } +} + +$opt_t = (defined($opt_t) ? $opt_t : 5); +$opt_s ||= 0; +$opt_lambda ||= 5; + +my $nybias = 10; + + +my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c, + scoreset => $opt_s, # ,, + logfile => $opt_l}); + +$masses->readlogs(); + +my $logs = $masses->get_logs(); + +my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore) = (0,0,0,0,0,0,0,0); + +my $num_spam = $masses->get_num_spam(); +my $num_ham = $masses->get_num_ham(); +my $num_logs = $num_spam + $num_ham; + +my $count = 0; + +my $score; + +foreach my $log (@$logs) { + + $score = 0; + foreach my $test (@{$log->{tests_hit}}) { + + next if ($test->{issubrule}); + next if (!$test->{score}); + + $score += $test->{score}; + + } + + if ($score >= $opt_t) { + if ($log->{isspam}) { + $ga_yy++; + $yyscore += $score; + } + else { + $ga_ny++; + $nyscore += $score; + } + } else { + if ($log->{isspam}) { + $ga_yn++; + $ynscore += $score; + } + else { + $ga_nn++; + $nnscore += $score; + } + } +} + +$nybias = $nybias * ($num_spam / $num_ham); + +my $fprate = ($ga_ny / $num_logs) * 100.0; +my $fnrate = ($ga_yn / $num_logs) * 100.0; + +printf ("\n# SUMMARY for threshold %3.1f:\n", $opt_t); +printf "# Correctly non-spam: %6d %4.2f%% (%4.2f%% of non-spam corpus)\n", $ga_nn, + ($ga_nn / $num_logs) * 100.0, ($ga_nn / $num_ham) * 100.0; +printf "# Correctly spam: %6d %4.2f%% (%4.2f%% of spam corpus)\n" , $ga_yy, + ($ga_yy / $num_logs) * 100.0, ($ga_yy / $num_spam) * 100.0; +printf "# False positives: %6d %4.2f%% (%4.2f%% of nonspam, %6.0f weighted)\n", $ga_ny, + $fprate, ($ga_ny / $num_ham) * 100.0, $nyscore*$nybias; +printf "# False negatives: %6d %4.2f%% (%4.2f%% of spam, %6.0f weighted)\n", $ga_yn, + $fnrate, ($ga_yn / $num_spam) * 100.0, $ynscore; + +# convert to the TCR metrics used in the published lit +my $nspamspam = $ga_yy; +my $nspamlegit = $ga_yn; +my $nlegitspam = $ga_ny; +my $nlegitlegit = $ga_yn; +my $nlegit = $num_ham; +my $nspam = $num_spam; + +my $werr = ($opt_lambda * $nlegitspam + $nspamlegit) + / ($opt_lambda * $nlegit + $nspam); + +my $werr_base = $nspam + / ($opt_lambda * $nlegit + $nspam); + +$werr ||= 0.000001; # avoid / by 0 +my $tcr = $werr_base / $werr; + +my $sr = ($nspamspam / $nspam) * 100.0; +my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0; +printf "# TCR: %3.6f SpamRecall: %3.3f%% SpamPrec: %3.3f%% FP: %3.2f%% FN: %3.2f%%\n", $tcr, $sr, $sp, $fprate, $fnrate; + Index: masses/extract-message-from-mbox =================================================================== --- masses/extract-message-from-mbox (revision 20231) +++ masses/extract-message-from-mbox (working copy) @@ -19,30 +19,61 @@ use bytes; use vars qw { - $opt_f $opt_h $opt_m $opt_H + $opt_h $opt_m }; -use Getopt::Std; -getopts("f:hmH"); -sub usage { - die "extract-message-from-mbox [-f=file] [-m] [-H] offset +use Getopt::Long qw(:config bundling auto_help); +use Pod::Usage; - Extracts the message starting at offset from file (or stdin). Very - useful in combination with mass-check logs and mboxes. If the -m - option is used, the input should be in \"mass-check\" format (as - output by mass-check). Use the -H option to just output headers. -"; -} +GetOptions("m|mass-check" => \$opt_m, "h|H|headers" => \$opt_h); -usage() if($opt_h || (!defined($ARGV[0]) && !$opt_m)); -my $offset = $ARGV[0]; +=head1 NAME +extract-message-from-mbox - Extract a message from an mbox + +=head1 SYNOPSIS + + extract-message-from-mbox [--headers] . + extract-message-from-mbox --mass-check + + Options: + -h, --headers Display only message headers + -m, --masscheck Read mass-check output from stdin + +=head1 DESCRIPTION + +B extracts the message from I +starting at the byte offset I. Very useful in combination with +mass-check logs and mboxes. If the -m or --mass-check option is used, +the input should be in "mass-check" format (as output by +mass-check). Use the -H option to just output headers. + +=head1 EXAMPLES + +To show messages that hit the rule BAYES_99 + +grep BAYES_99 masses.log | extract-message-from-mbox -m + +To show the message indicated by "/path/to/my/mbox.1234" + +extract-message-from-mbox /path/to/my/mbox.1234 + +=cut + + + if($opt_m) { masscheck(); } else { - $opt_f ||= '&STDIN'; - extract($opt_f, $offset); + foreach my $message (@ARGV) { + if ($message =~ /^(.*?)(?:\.(\d+))?$/) { + extract($1, ($2 || 0)); + } + else { + pod2usage("Argument must be of the form ."); + } + } } sub extract { @@ -61,14 +92,14 @@ $found++ if(/^From /); last if($found == 3); print; - last if ($opt_H && /^$/) # empty line? end of headers + last if ($opt_h && /^$/) # empty line? end of headers } } } sub masscheck { while () { - my $mail = (split(/\s+/, $_))[2]; + my $mail = (split(/\s+/, $_))[3]; $mail =~ tr/_/ /; if ($mail =~ /^(.*)\.(\d+)$/) { extract($1, $2); Index: masses/logs-to-c =================================================================== --- masses/logs-to-c (revision 20231) +++ masses/logs-to-c (working copy) @@ -16,257 +16,250 @@ # limitations under the License. # -use Getopt::Long; -use vars qw($opt_cffile $opt_count $opt_lambda $opt_threshold - $opt_spam $opt_nonspam); +=head1 NAME -GetOptions("cffile=s", "count", "lambda=f", "threshold=f", "spam=s", "nonspam=s", "scoreset=i"); -my $argcffile = $opt_cffile; +logs-to-c - Convert a mass-check log into perceptron format -my $justcount = 0; -if ($opt_count) { $justcount = 1; } +=head1 SYNOPSIS -my $threshold = 5; -if (defined $opt_threshold) { $threshold = $opt_threshold; } +logs-to-c [options] -$opt_spam ||= 'spam.log'; -$opt_nonspam ||= 'ham.log'; -$opt_scoreset = 0 if ( !defined $opt_scoreset ); + Options: + -c,--cffile=path Use path as the rules directory + -s,--scoreset=n Use scoreset n + -l,--logfile=file Read in file instead of masses.log + -o,--outputdir Put output in the specified dir (default tmp/) -my $nybias = 10; +=head1 DESCRIPTION -# lambda value for TCR equation, indicating the "cost" of recovering -# from an FP. The values are: 1 = tagged only, 9 = mailed back to -# sender asking for token (TMDA style), 999 = deleted outright. -# We (SpamAssassin) use a default of 5, representing "moved to -# infrequently-read folder". +B will read the mass-check log F or as +specified by the B<--logfile> option, and convert it into the format +needed by the perceptron. This is a format that is simple for the +perceptron to parse, but is not very readable to humans. -my $lambda = 5; -if ($opt_lambda) { $lambda = $opt_lambda; } +By default, output will be put in the directory ./tmp/ unless another +directory is specified by the B<--outputdir> option. (Note: at the +current time, this must be /tmp/ in order for the perceptron to +compile properly.) -my %is_spam = (); -my %tests_hit = (); -my %mutable_tests = (); +=head1 BUGS -use vars qw(%rules %allrules); +Please report bugs to http://bugzilla.spamassassin.org/ -readscores(); +=head1 SEE ALSO -print "Reading per-message hit stat logs and scores...\n"; -my ($num_tests, $num_spam, $num_nonspam); -my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore); +L, L, L -readlogs(); -read_ranges(); +=cut -if ($justcount) { - $nybias = $nybias*($num_spam / $num_nonspam); - evaluate(); -} else { - print "Writing logs and current scores as C code...\n"; - writescores_c(); -} -exit 0; +use FindBin; +use lib "$FindBin::Bin/../lib"; +use Mail::SpamAssassin::Masses; +use Getopt::Long qw(:config bundling auto_help); +use Pod::Usage; +use strict; +use warnings; +use vars qw{$opt_c $opt_l $opt_s $opt_o}; -sub readlogs { - my $count = 0; - $num_spam = $num_nonspam = 0; +GetOptions("c|cffile=s@" => \$opt_c, + "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode) + "l|logfile=s" => \$opt_l, + "o|output=s" => \$opt_o); - if ($justcount) { - $ga_yy = $ga_ny = $ga_yn = $ga_nn = 0; - $yyscore = $ynscore = $nyscore = $nnscore = 0.0; - } - foreach my $file ($opt_spam, $opt_nonspam) { - open (IN, "<$file"); +$opt_o ||= "./tmp/"; +if (!-d $opt_o) { + mkdir $opt_o, 0777 or die "Can't mkdir $opt_o"; +} - while () { - next if /^\#/; - next if /^$/; - if($_ !~ /^.\s+([-\d]+)\s+\S+\s*/) { warn "bad line: $_"; next; } - my $hits = $1; -#my $foo = $_; - $_ = $'; s/(?:bayes|time)=\S+//; s/,,+/,/g; s/^\s+//; s/\s+$//; +$opt_l ||= "masses.log"; - my $score = 0; - my @tests = (); - foreach my $tst (split (/,/, $_)) { - next if ($tst eq ''); - if (!defined $scores{$tst}) { - #warn "unknown test in $file, ignored: $tst\n"; - next; +if (!$opt_c || !scalar(@$opt_c)) { + # Try to read this in from the log, if possible + open IN, $opt_l or die "Can't open $opt_l: $!"; + my $files = 0; # are we in the files section? + while() { + if (!$files) { + if (/^\# SVN revision:/) { + $opt_c = [ "$FindBin::Bin/../rules" ]; + last; + } elsif (/^\# Using configuration:$/) { + $files = 1; + } + } elsif (/^\#\s+(.*)\s*$/) { + push (@$opt_c, $1); + } else { + # All done! + last; } + } - # Make sure to skip any subrules! - next if ( $allrules{$tst}->{issubrule} ); + if (!defined $opt_c) { + $opt_c = [ "$FindBin::Bin/../rules" ]; + } - if ($justcount) { - $score += $scores{$tst}; - } else { - push (@tests, $tst); - } - } - - if (!$justcount) { - $tests_hit{$count} = \@tests; - } - - if ($file eq $opt_spam) { - $num_spam++; - if ($justcount) { - if ($score >= $threshold) { - $ga_yy++; $yyscore += $score; - } else { - $ga_yn++; $ynscore += $score; - } - } else { - $is_spam{$count} = 1; - } - } else { - $num_nonspam++; - if ($justcount) { - if ($score >= $threshold) { -#print "$score -- $foo"; - $ga_ny++; $nyscore += $score; - } else { - $ga_nn++; $nnscore += $score; - } - } else { - $is_spam{$count} = 0; - } - } - $count++; + foreach my $file (@$opt_c) { + die "Can't read $file" unless -r $file; } - close IN; - } - $num_tests = $count; } +# ignore rules that are subrules -- we don't generate scores for them... -sub readscores { - if (!defined $argcffile) { $argcffile = "../rules"; } - print "Reading scores from \"$argcffile\"...\n"; - system ("./parse-rules-for-masses -d \"$argcffile\" -s $opt_scoreset") and die; - require "./tmp/rules.pl"; - %allrules = %rules; # ensure it stays global -} +# Note: this will cause a difference over the old logs-to-c since rank +# is dependent on the frequencies of all rules, not just non-subrules +my $greprules = sub { return 0 if $_[1]->{issubrule}; return 1; }; + +$opt_s ||= 0; # | + +my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c, + scoreset => $opt_s, # ,, + logfile => $opt_l, + greprules => $greprules }); + +$masses->readlogs(); +$masses->do_score_ranges(); + +my $rules = $masses->get_rules_array(); +my $logs = $masses->get_logs(); + +my @index_to_rule; +my $num_spam = $masses->get_num_spam(); +my $num_ham = $masses->get_num_ham(); + +# This is misleading -- num_tests is really num_msgs +my $num_tests = $num_spam + $num_ham; + + +# Write logs and scores as C code +writescores_c(); +writetests_c(); + + sub writescores_c { - my $output = ''; - my $size = 0; + my $mutable = 0; - my $i; + my $output = ''; + my $count = 0; + my $score = 0; - # jm: now, score-ranges-from-freqs has tflags to work from, so - # it will always list all mutable tests. + foreach my $rule (sort {($b->{ismutable} <=> $a->{ismutable}) || + ($a->{name} cmp $b->{name}) } @$rules) { - @index_to_rule = sort {($ignored_rule{$a} <=> $ignored_rule{$b}) || - ($mutable_tests{$b} <=> $mutable_tests{$a}) || - ($a cmp $b)} (keys %scores); - my $max_hits_per_msg = 0; - for ($file = 0; $file < $num_tests; $file++) { - my(@hits) = - grep {(! $ignored_rule{$_}) && $mutable_tests{$_}} (@{$tests_hit{$file}}); - if ((scalar(@hits)+1) > $max_hits_per_msg) { - $max_hits_per_msg = scalar(@hits)+1; - } - } + $score = $rule->{score}; - for ($i = 0; $i <= $#index_to_rule; $i++) { - my $name = $index_to_rule[$i]; - $rule_to_index{$name} = $i; + # ignored rules (i.e. no scores) + next unless $score; - if ($ignored_rule{$name}) { next; } + # also ignore rules with score range 0 + next if (!$rule->{range_lo} && !$rule->{range_hi}); - if ($mutable_tests{$name} == 0) { - $range_lo{$name} = $range_hi{$name} = $scores{$name}; - } else { + # Set an index + $rule->{index} = $count; + $index_to_rule[$count] = $rule; # add the reference to the array + + if ($rule->{ismutable}) { $mutable++; - if ($range_lo{$name} > $range_hi{$name}) { - ($range_lo{$name},$range_hi{$name}) = - ($range_hi{$name},$range_lo{$name}); + if ($score > $rule->{range_hi}) { + $score = $rule->{range_hi} - 0.001; + } elsif ($score < $rule->{range_lo}) { + $score = $rule->{range_lo} + 0.001; } - #$range_lo{$name} ||= 0.1; - #$range_hi{$name} ||= 1.5; } + # These should all be set properly if not mutable + # score = range_lo = range_hi + else { + warn "hi != lo for " . $rule->{name} . "!" if $rule->{range_lo} != $rule->{range_hi}; + $score = $rule->{range_hi} = $rule->{range_lo}; + } - $output .= ".".$i."\n". - "n".$name."\n". - "b".$scores{$name}."\n". - "m".$mutable_tests{$name}."\n". - "l".$range_lo{$name}."\n". - "h".$range_hi{$name}."\n"; - $size++; + $output .= "." . $count . "\n" . + "n" . $rule->{name} . "\n" . + "b" . $score . "\n" . + "m" . $rule->{ismutable} . "\n" . + "l" . $rule->{range_lo} . "\n" . + "h" . $rule->{range_hi} . "\n"; + + $count++; + } + # Output this - open (DAT, ">tmp/scores.data"); - print DAT "N$size\n", "M$mutable\n", # informational only - $output; + open (DAT, ">$opt_o/scores.data"); + print DAT "N$count\n", "M$mutable\n"; # informational + print DAT $output; close DAT; - open (OUT, ">tmp/scores.h"); - print OUT " + open (OUT, ">$opt_o/scores.h"); + print OUT < #include #include - -int num_scores = $size; + +int num_scores = $count; int num_mutable = $mutable; -unsigned char is_mutable[$size]; -double range_lo[$size]; -double range_hi[$size]; -double bestscores[$size]; -char *score_names[$size]; -double tmp_scores[$size][2]; +unsigned char is_mutable[$count]; +double range_lo[$count]; +double range_hi[$count]; +double bestscores[$count]; +char *score_names[$count]; +double tmp_scores[$count][2]; unsigned char ny_hit[$mutable]; unsigned char yn_hit[$mutable]; - + double lookup[$mutable]; - + /* readscores() is defined in tests.h */ +EOF -"; close OUT; - writetests_c($max_hits_per_msg); # make sure $rule_to_index is around } + sub writetests_c { - my $max_hits_per_msg = $_[0]; - my(%uniq_files) = (); - my(%count_keys) = (); - my(%file_key) = (); + my $max_hits_per_msg = 0; + my @goodtests; + my %uniq_logs; + my $uniq_key; - my $file; + my $i = 0; - for ($file = 0; $file < $num_tests; $file++) - { - my $uniq_key = $is_spam{$file} . " "; + # This will "compress" the logs so that one log entry can have a + # "count" of n indicating it reprents n similar messages - my(@good_tests) = - grep {length($_) && (! $ignored_rule{$_}) && - (defined($rule_to_index{$_}))} (@{ $tests_hit{$file} }); + foreach my $log (@$logs) { - @good_tests = sort {$a <=> $b} (map {$rule_to_index{$_}} (@good_tests)); + (@goodtests) = grep {exists($_->{index})} (@{$log->{tests_hit}}); + @goodtests = sort {$a <=> $b} map {$_->{index}} @goodtests; - $uniq_key .= join(" ",@good_tests); + if($max_hits_per_msg < scalar(@goodtests)) { + $max_hits_per_msg = scalar(@goodtests); + } - if (exists($count_keys{$uniq_key})) { - $count_keys{$uniq_key}++; + $uniq_key = $log->{isspam} ? "s" : ""; + $uniq_key .= join(" ", @goodtests); + + + # The %count_keys hash's entries will be the log info for each unique log + # $log->{count} is increased to indicate similar logs + + if (exists($uniq_logs{$uniq_key})) { + $uniq_logs{$uniq_key}->{count}++; } else { - $count_keys{$uniq_key} = 1; - $file_key{$file} = $uniq_key; - $uniq_files{$file} = scalar(keys(%count_keys)) - 1; + $uniq_logs{$uniq_key} = $log; + $uniq_logs{$uniq_key}->{count} = 1; } + } - my $num_nondup = scalar(keys(%uniq_files)); + my $num_nondup = scalar(keys %uniq_logs); - open (TOP, ">tmp/tests.h"); - print TOP " + open TOP, ">$opt_o/tests.h"; + print TOP < #include #include @@ -274,7 +267,7 @@ int num_tests = $num_tests; int num_nondup = $num_nondup; int num_spam = $num_spam; -int num_nonspam = $num_nonspam; +int num_nonspam = $num_ham; int max_hits_per_msg = $max_hits_per_msg; unsigned char num_tests_hit[$num_nondup]; unsigned char is_spam[$num_nondup]; @@ -282,196 +275,77 @@ double scores[$num_nondup]; double tmp_total[$num_nondup]; int tests_count[$num_nondup]; +EOF -"; - $_ = join ('', ); - print TOP $_; + + print TOP join('', ); close TOP; - open (DAT, ">tmp/tests.data"); - foreach $file (sort {$a <=> $b} (keys %uniq_files)) { - print DAT ".".$uniq_files{$file}."\n"; + open (DAT, ">$opt_o/tests.data"); - my $out = ''; - $out .= "s".$is_spam{$file}."\n"; + my $out; + my $base_score; + my $num_tests_hit; - my $base_score = 0; - my $num_tests_hit = 0; - foreach my $test (@{$tests_hit{$file}}) { - if ($test eq '') { next; } + $i = 0; + foreach my $log (values %uniq_logs) { + $out = ''; + $base_score = $num_tests_hit = 0; - if ($ignored_rule{$test}) { - warn "ignored rule $test got a hit in $file!\n"; - next; + print DAT "." . $i . "\n"; + + $out .= "s" . ( ($log->{isspam})? 1 : 0 ) . "\n"; + + foreach my $test (@{$log->{tests_hit}}) { + if (!$test->{score}) { + # Don't really know why this happens, but the old logs-to-c + #did it too + + warn "ignored rule " . $test->{name} . " got a hit!"; + next; } - if (!defined $rule_to_index{$test}) { - warn "test with no C index: $test\n"; + if (!$test->{range_lo} && !$test->{range_hi}) { + # We ignored this rule next; } - if ($mutable_tests{$test}) { - $num_tests_hit++; - $out .= "t".$rule_to_index{$test}."\n"; - - if ($num_tests_hit >= $max_hits_per_msg) { - die "Need to increase \$max_hits_per_msg"; + # debugging... + if (!defined $test->{index}) { + warn "test with no index"; } - } else { - $base_score += $scores{$test}; - } - } - $out .= "b" . $base_score . "\n"; # score to add in for non-mutable tests - $out .= "c" . $count_keys{$file_key{$file}} . "\n"; + if ($test->{ismutable}) { + $num_tests_hit++; + $out .= "t".$test->{index}."\n"; - print DAT "n".$num_tests_hit."\n".$out; - } - close DAT; -} + if ($num_tests_hit >= $max_hits_per_msg) { + die "\$max_hits_per_msg not big enough!"; + } -sub read_ranges { - if (!-f 'tmp/ranges.data') { - system ("make tmp/ranges.data"); - } + } + else { + $base_score += $test->{score}; + } - # read ranges, and mutableness, from ranges.data. - open (IN, ") { - /^(\S+) (\S+) (\d+) (\S+)$/ or next; - my $t = $4; - $range_lo{$t} = $1+0; - $range_hi{$t} = $2+0; - my $mut = $3+0; - - if ($allrules{$t}->{issubrule}) { - $ignored_rule{$t} = 1; - $mutable_tests{$t} = 0; - next; } - if (($range_lo{$t} == $range_hi{$t}) && (! $range_lo{$t})) { - #warn "ignored rule: score and range == 0: $t\n"; - $ignored_rule{$t} = 1; - $mutable_tests{$t} = 0; - next; - } - $ignored_rule{$t} = 0; - $index_to_rule[$count] = $t; - $count++; + $out .= "b" . $base_score . "\n"; # score to add for non-mutable tests + $out .= "c" . $log->{count} . "\n"; # number of identical logs - if (!$mut) { - $mutable_tests{$t} = 0; - } elsif ($range_lo{$t} == $range_hi{$t}) { - $mutable_tests{$t} = 0; - } elsif ($allrules{$t}->{tflags} =~ m/\buserconf\b/i) { - $mutable_tests{$t} = 0; - } else { - $mutable_tests{$t} = 1; - } - unless ($mutable_tests{$t} || $scores{$t}) { - $ignored_rule{$t} = 1; - } - } - close IN; + print DAT "n" . $num_tests_hit . "\n" . $out; - # catch up on the ones missed; seems to be userconf or 0-hitters mostly. - foreach my $t (sort keys %allrules) { - next if (exists($range_lo{$t})); - if ($allrules{$t}->{issubrule}) { - $ignored_rule{$t} = 1; - $mutable_tests{$t} = 0; - next; - } - $ignored_rule{$t} = 0; - unless (exists($mutable_tests{$t}) && - ($allrules{$t}->{tflags} !~ m/\buserconf\b/i)) { - $mutable_tests{$t} = 0; - } - unless ($mutable_tests{$t} || $scores{$t}) { - $ignored_rule{$t} = 1; - } - $index_to_rule[$count] = $t; - $count++; + $i++; } - foreach my $t (keys %range_lo) { - next if ($ignored_rule{$t}); - if ($mutable_tests{$t}) { - if (($scores{$t} == 1) && ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) { - $scores{$t} = -1; - } elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/) && - ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) { - $scores{$t} = -0.01; - } - if ($scores{$t} >= $range_hi{$t}) { - $scores{$t} = $range_hi{$t} - 0.001; - } elsif ($scores{$t} <= $range_lo{$t}) { - $scores{$t} = $range_lo{$t} + 0.001; - } - } else { - if ($allrules{$t}->{tflags} =~ m/\buserconf\b/i) { - next; - } elsif ($range_lo{$t} == $range_hi{$t}) { - $scores{$t} = $range_lo{$t}; - next; - } - if (($scores{$t} == 1) && ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) { - $scores{$t} = -1; - } elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/) && - ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) { - $scores{$t} = -0.01; - } - if ($scores{$t} > $range_hi{$t}) { - $scores{$t} = $range_hi{$t}; - } elsif ($scores{$t} < $range_lo{$t}) { - $scores{$t} = $range_lo{$t}; - } - } - } -} -sub evaluate { - my $fprate = ($ga_ny / $num_tests) * 100.0; - my $fnrate = ($ga_yn / $num_tests) * 100.0; + close DAT; - printf ("\n# SUMMARY for threshold %3.1f:\n", $threshold); - printf "# Correctly non-spam: %6d %4.2f%% (%4.2f%% of non-spam corpus)\n", $ga_nn, - ($ga_nn / $num_tests) * 100.0, ($ga_nn / $num_nonspam) * 100.0; - printf "# Correctly spam: %6d %4.2f%% (%4.2f%% of spam corpus)\n" , $ga_yy, - ($ga_yy / $num_tests) * 100.0, ($ga_yy / $num_spam) * 100.0; - printf "# False positives: %6d %4.2f%% (%4.2f%% of nonspam, %6.0f weighted)\n", $ga_ny, - $fprate, ($ga_ny / $num_nonspam) * 100.0, $nyscore*$nybias; - printf "# False negatives: %6d %4.2f%% (%4.2f%% of spam, %6.0f weighted)\n", $ga_yn, - $fnrate, ($ga_yn / $num_spam) * 100.0, $ynscore; - # convert to the TCR metrics used in the published lit - my $nspamspam = $ga_yy; - my $nspamlegit = $ga_yn; - my $nlegitspam = $ga_ny; - my $nlegitlegit = $ga_yn; - my $nlegit = $num_nonspam; - my $nspam = $num_spam; - - my $werr = ($lambda * $nlegitspam + $nspamlegit) - / ($lambda * $nlegit + $nspam); - - my $werr_base = $nspam - / ($lambda * $nlegit + $nspam); - - $werr ||= 0.000001; # avoid / by 0 - my $tcr = $werr_base / $werr; - - my $sr = ($nspamspam / $nspam) * 100.0; - my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0; - printf "# TCR: %3.6f SpamRecall: %3.3f%% SpamPrec: %3.3f%% FP: %3.2f%% FN: %3.2f%%\n", $tcr, $sr, $sp, $fprate, $fnrate; } -__DATA__ +__DATA__ void loadtests (void) { FILE *fin = fopen ("tmp/tests.data", "r"); char buf[256]; @@ -557,4 +431,3 @@ printf ("Read scores for %d tests.\n", num_scores); } - Index: masses/post-ga-analysis.pl =================================================================== --- masses/post-ga-analysis.pl (revision 20231) +++ masses/post-ga-analysis.pl (working copy) @@ -7,9 +7,8 @@ my %scores; my %rulehit; -open(SPAM, ") { @@ -22,11 +21,12 @@ close(SCORES); -while() +while() { next if /^#/; - /.\s+[-0-9]*\s+[^\s]+\s+([^\s]*)(\s+?:(?:bayes|time)=\S+)\s*?$/; - my @rules=split /,/,$1; + /(.)\s+.\s+[-0-9]*\s+[^\s]+\s+([^\s]*)(\s+?:(?:bayes|time)=\S+)\s*?$/; + my $class = $1; + my @rules=split /,/,$2; my $score = 0.0; foreach $rule (@rules) { @@ -35,7 +35,7 @@ $rulehit{$rule}++; } - if($score < 5) + if($class eq "s" && $score < 5) { foreach $rule (@rules) { @@ -44,27 +44,8 @@ } $nfn++; } -} - -close(SPAM); - -while() -{ - next if /^#/; - /.\s+[-0-9]*\s+[^\s]+\s+([^\s]*)\s*$/; - next unless defined($1); - - my @rules=split /,/,$1; - my $score = 0.0; - foreach $rule (@rules) + if($class eq "h" && score >= 5) { - next unless (defined ($scores{$rule})); - $score += $scores{$rule}; - $rulehit{$rule}++; - } - - if($score >= 5) - { foreach $rule (@rules) { next unless (defined ($scores{$rule})); @@ -72,8 +53,11 @@ } $nfp++; } + } +close(LOGS); + @fpk = sort { $falsepos{$b}/($rulehit{$b}||0.0001) <=> $falsepos{$a}/($rulehit{$a}||0.00001) } keys %falsepos; print "COMMON FALSE POSITIVES: ($nfp total)\n-----------------------\n\n"; Index: masses/convert-old-logs-to-new =================================================================== --- masses/convert-old-logs-to-new (revision 0) +++ masses/convert-old-logs-to-new (revision 0) @@ -0,0 +1,15 @@ +#!/bin/sh -e + +cat spam.log | perl -ne's/^Y/s s/; s/^\./s h/; print unless /^\#/;' \ + > spam.log.sorted + +cat ham.log | perl -ne's/^Y/h s/; s/^\./h h/; print unless /^\#/;' \ + > ham.log.sorted + +# sort by time + +echo \# SVN revision: > masses.log + +sort --field-separator='=' -n -k2,2 --merge spam.log.sorted ham.log.sorted \ + >> masses.log + Property changes on: masses/convert-old-logs-to-new ___________________________________________________________________ Name: svn:executable + * Index: masses/score-ranges-from-freqs =================================================================== --- masses/score-ranges-from-freqs (revision 20231) +++ masses/score-ranges-from-freqs (working copy) @@ -1,251 +0,0 @@ -#!/usr/bin/perl -w -# -# <@LICENSE> -# Copyright 2004 Apache Software Foundation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# (rough) graphic demo of this algorithm: -# 0.0 = -limit [......] 0 ........ limit -# 0.25 = -limit ..[..... 0 .]...... limit -# 0.5 = -limit ....[... 0 ...].... limit -# 0.75 = -limit ......[. 0 .....].. limit -# 1.0 = -limit ........ 0 [......] limit -my $sliding_window_limits = 4.8; # limits = [-$range, +$range] -my $sliding_window_size = 5.5; # scores have this range within limits - -# 0.0 = -limit [......] 0 ........ limit -# 0.25 = -limit ....[... 0 ]....... limit -# 0.5 = -limit ......[. 0 .]...... limit (note: tighter) -# 0.75 = -limit .......[ 0 ...].... limit -# 1.0 = -limit ........ 0 [......] limit -my $shrinking_window_lower_base = 0.00; -my $shrinking_window_lower_range = 1.00; # *ratio, added to above -my $shrinking_window_size_base = 1.00; -my $shrinking_window_size_range = 1.00; # *ratio, added to above - -my $use_sliding_window = 0; - -my $argcffile = shift @ARGV; -my $scoreset = shift @ARGV; -$scoreset = 0 if ( !defined $scoreset ); - -if (defined ($argcffile) && $argcffile eq '-test') { - # use this to debug the ranking -> score-range mapping: - for $rat (0.0, 0.25, 0.5, 0.75, 1.0) { - my ($lo, $hi); if ($use_sliding_window) { - ($lo, $hi) = sliding_window_ratio_to_range($rat); - } else { - ($lo, $hi) = shrinking_window_ratio_to_range($rat); - } - warn "test: $rat => [ $lo $hi ]\n"; - } exit; -} - -my %freq_spam = (); -my %freq_nonspam = (); - -my $num_spam; -my $num_nonspam; -my $num_total; - -my %mutable_tests = (); -my %ranking = (); -my %soratio = (); -my %is_nice = (); - -if (!defined $argcffile) { $argcffile = "../rules"; } -system ("./parse-rules-for-masses -d \"$argcffile\" -s $scoreset") and die; -if (-e "tmp/rules.pl") { - # Note, the spaces need to stay in front of the require to work around a RPM 4.1 problem - require "./tmp/rules.pl"; -} -else { - die "parse-rules-for-masses had no error but no tmp/rules.pl!?!"; -} - -while (<>) { - /^\s*([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+\S+\s+(.+)\s*$/ or next; - - my $overall = $1+0; - my $spam = $2+0; - my $nonspam = $3+0; - my $soratio = $4+0; - my $ranking = $5+0; - my $test = $6; - - if ($test eq '(all messages)') { - $num_spam = $spam; - $num_nonspam = $nonspam; - $num_total = $spam+$nonspam; - next; - } - next if ($test eq '(all messages as %)'); - - if (!defined ($rules{$test})) { - warn "rule $test no longer exists; ignoring\n"; - next; - } - - $freq{$test} = $overall; - $freq_spam{$test} = $spam; - $freq_nonspam{$test} = $nonspam; - - my $tflags = $rules{$test}->{tflags}; $tflags ||= ''; - if ($tflags =~ /\buserconf\b/ || - ( ($scoreset % 2) == 0 && $tflags =~ /\bnet\b/ )) { - $mutable_tests{$test} = 0; - } else { - $mutable_tests{$test} = 1; - } - if ($tflags =~ m/\bnice\b/i) { - $is_nice{$test} = 1; - } else { - $is_nice{$test} = 0; - } - - if ($overall < 0.01) { # less than 0.01% of messages were hit - $mutable_tests{$test} = 0; - $soratio{$test} = 0.5; - $ranking{$test} = 0.0; - $rules{$test}->{score} = 0; # tvd - disable these rules automagically - - } else { - $soratio{$test} = $soratio; - $ranking{$test} = $ranking; - } -} - -if ( ! mkdir "tmp", 0755 ) { - warn "Couldn't create tmp directory!: $!\n"; -} - -open (OUT, ">tmp/ranges.data"); -foreach my $test (sort { $ranking{$b} <=> $ranking{$a} } keys %freq) { - if (!defined ($rules{$test})) { - warn "no rule $test"; - print OUT ("0 0 0 $test\n"); - next; - } - - my $overall = $freq{$test}; - my $spam = $freq_spam{$test}; - my $nonspam = $freq_nonspam{$test}; - my $soratio = $soratio{$test}; - my $ranking = $ranking{$test}; - my $mutable = $mutable_tests{$test}; - - if (!$mutable || $rules{$test}->{score} == 0) { # didn't look for score 0 - tvd - printf OUT ("%3.3f %3.3f 0 $test\n", - $rules{$test}->{score}, - $rules{$test}->{score}); - next; - } - - # 0.0 = best nice, 1.0 = best nonnice - if ($is_nice{$test}) { - $ranking = .5 - ($ranking / 2); - } else { - $ranking = .5 + ($ranking / 2); - } - - my ($lo, $hi); - if ($use_sliding_window) { - ($lo, $hi) = sliding_window_ratio_to_range($ranking); - } else { - ($lo, $hi) = shrinking_window_ratio_to_range($ranking); - } - - # tvd - my $tflags = $rules{$test}->{tflags}; $tflags ||= ''; - if ( $is_nice{$test} && ( $ranking < .5 ) ) { # proper nice rule - if ( $tflags =~ /\blearn\b/ ) { # learn rules should get a higher score # -5.4 - $lo *=1.8; - } - elsif ($soratio <= 0.05 && $nonspam > 0.5) { # let good rules be larger if they want to, -4.5 - $lo *= 1.5; - } - - $hi = ($soratio == 0) ? $lo : - ($soratio <= 0.005 ) ? $lo/1.1 : - ($soratio <= 0.010 && $nonspam > 0.2) ? $lo/2.0 : - ($soratio <= 0.025 && $nonspam > 1.5) ? $lo/10.0 : - 0; - - if ( $soratio >= 0.35 ) { # auto-disable bad rules - ($lo,$hi) = (0,0); - } - } - elsif ( !$is_nice{$test} && ( $ranking >= .5 ) ) { # proper spam rule - if ( $tflags =~ /\blearn\b/ ) { # learn rules should get a higher score - $hi *=1.8; - } - elsif ( $soratio >= 0.99 && $spam > 1.0 ) { - $hi *= 1.5; # let good rules be larger if they want to - } - - $lo = ($soratio == 1) ? $hi: - ($soratio >= 0.995 ) ? $hi/4.0 : - ($soratio >= 0.990 && $spam > 1.0) ? $hi/8.0 : - ($soratio >= 0.900 && $spam > 10.0) ? $hi/24.0 : - 0; - - if ( $soratio <= 0.65 ) { # auto-disable bad rules - ($lo,$hi) = (0,0); - } - } - else { # rule that has bad nice setting - ($lo,$hi) = (0,0); - } - $mutable = 0 if ( $hi == $lo ); - - printf OUT ("%3.1f %3.1f $mutable $test\n", $lo, $hi); -} -close OUT; -exit; - -sub sliding_window_ratio_to_range { - my $ratio = shift; - my $lo = -$sliding_window_limits + ($sliding_window_size * $ratio); - my $hi = +$sliding_window_limits - ($sliding_window_size * (1-$ratio)); - if ($lo > $hi) { # ??? - ($lo,$hi) = ($hi,$lo); - } - ($lo, $hi); -} - -sub shrinking_window_ratio_to_range { - my $ratio = shift; - my $is_nice = 0; - my $adjusted = ($ratio -.5) * 2; # adj [0,1] to [-1,1] - if ($adjusted < 0) { $is_nice = 1; $adjusted = -$adjusted; } - -#$adjusted /= 1.5 if ( $ratio < 0.95 && $ratio > 0.15 ); # tvd - - my $lower = $shrinking_window_lower_base - + ($shrinking_window_lower_range * $adjusted); - my $range = $shrinking_window_size_base - + ($shrinking_window_size_range * $adjusted); - my $lo = $lower; - my $hi = $lower + $range; - if ($is_nice) { - my $tmp = $hi; $hi = -$lo; $lo = -$tmp; - } - if ($lo > $hi) { # ??? - ($lo,$hi) = ($hi,$lo); - } - - ($lo, $hi); -} - Index: masses/find-extremes =================================================================== --- masses/find-extremes (revision 20231) +++ masses/find-extremes (working copy) @@ -17,38 +17,144 @@ # limitations under the License. # -use Getopt::Std; -getopts("l:L:h"); +use FindBin; +use lib "$FindBin::Bin/../lib"; +use Mail::SpamAssassin::Masses; +use Getopt::Long qw(:config bundling auto_help); +use Pod::Usage; +use strict; +use warnings; + use vars qw { - $opt_h $opt_l $opt_L +$opt_c $opt_s $opt_l $opt_L $opt_inclang }; -sub usage { - die "find-extremes [-l LC] [-L LC] [spam log] [nonspam log] +GetOptions("c|cffile=s@" => \$opt_c, + "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode) + "l|logfile=s" => \$opt_l, + "L|language=s" => \$opt_L, + "include-language=s" => \$opt_inclang); - -l LC also print language specific rules for lang code LC (or 'all') - -L LC only print language specific rules for lang code LC (or 'all') - options -l and -L are mutually exclusive. - if either the spam or and nonspam logs are unspecified, the defaults - are \"spam.log\" and \"nonspam.log\" in the cwd. +my $lower = 1; +#$threshold = 5; +my $higher = 9; +my $min_expected = 2; # Should not be set to more than 5 or less than 2 -"; + +=head1 NAME + +find-extremes - Determine which rules are most likely to cause false positives/negatives. + +=head1 SYNOPSIS + +hit-frequencies [options] + + Options: + -c,--cffile=path Use path as the rules directory + -s,--scoreset=n Use scoreset n + -l,--logfile=file Read in file instead of masses.log + -L,--language=lc Only print language specific tests for specified lang code (try 'all') + --include-language=lc Also print language specific tests for specified lang code (try 'all') + +=head1 DESCRIPTION + +B will read the mass-check log F or the +log given by the B<--logfile> option. By default, B +will assume the proper values for B<--cffile> based on the header of +the masses.log. The output will include the following columns: + +=over 4 + +=item RULE + +=item CHISQUARE + +=item RATIO_FALSEPOS + +=item OVER_FALSEPOS + +=item FREQ_OVER + +=back + +=head1 BUGS + +This script may or may not work as designed - it probably needs some +tweaking, and I probably introduced a bug into it while re-writing for +the new Masses stuff. + +=head1 NOTES + +This script is poorly documented. Patches welcome. + +=cut + + +$opt_s = 0 unless defined $opt_s; + +my $ok_lang = lc ( $opt_inclang || $opt_L || ''); +$ok_lang = '.' if ($ok_lang eq 'all'); + +my $greprules = sub { + my ($name, $rule) = @_; + + return 0 if (($opt_L && !$rule->{lang}) || + ($rule->{lang} && + (!$ok_lang || $rule->{lang} !~ /^$ok_lang/i))); # Wrong language + + return 0 if ($rule->{tflags} =~ /\bnet\b/); + + return 1; + +}; + +$opt_l ||= "masses.log"; + +if (!$opt_c || !scalar(@$opt_c)) { + # Try to read this in from the log, if possible + open (IN, $opt_l) or die "Can't open $opt_l: $!"; + my $files = 0; # are we in the files section? + while() { + if (!$files) { + if (/^\# SVN revision:/) { + $opt_c = [ "$FindBin::Bin/../rules" ]; + last; + } elsif (/^\# Using configuration:$/) { + $files = 1; + } + } elsif (/^\#\s+(.*)\s*$/) { + push (@$opt_c, $1); + } else { + # All done! + last; + } + } + + foreach my $file (@$opt_c) { + die "Can't read $file" unless -r $file; + } } -usage() if($opt_h || ($opt_l && $opt_L)); +my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c, + scoreset => $opt_s, + greprules => $greprules, + logfile => $opt_l, + nologs => 1}); -$lower = 1; -#$threshold = 5; -$higher = 9; -$min_expected = 2; # Should not be set to more than 5 or less than 2 +$masses->readrules(); +$masses->readlogs(); -my %freq_spam = (); # how often non-nice found in spam +my $rules = $masses->get_rules_hash(); +my $logs = $masses->get_logs(); + +my $num_spam = $masses->get_num_spam(); +my $num_ham = $masses->get_num_ham(); + my %freq_over_higher_falsepos = (); # how often non-nice found in ones over # higher threshold that are false positives -my %freq_nonspam = (); # how often nice found in nonspam my %freq_under_lower_falseneg = (); # how often nice found in ones under # lower threshold that are false negatives @@ -59,43 +165,54 @@ my %ratio_expected_falsepos = (); # ratio version of above my %ratio_expected_falseneg = (); # ditto -my $num_spam = 0; -my $num_nonspam = 0; my $num_over_higher_falsepos = 0; my $num_under_lower_falseneg = 0; -my $ok_lang = ''; -readscores(); +my %chisquare = ( ); +my %prob = ( ); -$ok_lang = lc ($opt_l || $opt_L || ''); -if ($ok_lang eq 'all') { $ok_lang = '.'; } -foreach my $key (keys %rules) { +foreach my $key (keys %$rules) { - if ( ($opt_L && !$rules{$key}->{lang}) || - ($rules{$key}->{lang} && - (!$ok_lang || $rules{$key}->{lang} !~ /^$ok_lang/i) - ) ) { - delete $rules{$key} ; next; - } - - if ($rules{$key}->{tflags} =~ m/net/) { - delete $rules{$key}; - next; - } - if ($rules{$key}->{tflags} !~ m/userconf/) { - if ($rules{$key}->{tflags} =~ m/nice/) { - $freq_nonspam{$key} = 0; + if ($rules->{$key}->{tflags} !~ /\buserconf\b/) { + if ($rules->{$key}->{tflags} =~ m/nice/) { $freq_under_lower_falseneg{$key} = 0; } else { - $freq_spam{$key} = 0; $freq_over_higher_falsepos{$key} = 0; } } + } -readlogs(); +foreach my $log (@$logs) { + if($log->{isspam}) { + # Also need to count plus_hits + my $plus_hits = 0; + foreach my $test (@{$log->{tests_hit}}) { + $plus_hits += $test->{score} if ($test->{score} > 0); + } + + if(($log->{score} <= $lower) && $plus_hits && $plus_hits >= $lower) { + $num_under_lower_falseneg++; + foreach my $test (@{$log->{tests_hit}}) { + $num_under_lower_falseneg++; + $freq_under_lower_falseneg{$test->{name}}++ if exists $freq_under_lower_falseneg{$test->{name}}; + } + } + } + else { + if($log->{score} > $higher) { + $num_over_higher_falsepos++; + foreach my $test (@{$log->{tests_hit}}) { + $num_over_higher_falsepos++; + $freq_over_higher_falsepos{$test->{name}}++ if exists $freq_over_higher_falsepos{$test->{name}}; + } + } + } + +} + unless (($num_over_higher_falsepos >= $min_expected) && ($num_under_lower_falseneg >= $min_expected)) { die "Insufficient extremes in dataset (" . $num_over_higher_falsepos . @@ -119,12 +236,13 @@ } my $ratio_falsepos = $num_over_higher_falsepos/$num_spam; -my $ratio_falseneg = $num_under_lower_falseneg/$num_nonspam; +my $ratio_falseneg = $num_under_lower_falseneg/$num_ham; my $skipped_non_nice = 0; -foreach $rule (keys %freq_spam) { - my $expected = $freq_spam{$rule}*$ratio_falsepos; +# non-nice rules +foreach my $rule (keys %freq_over_higher_falsepos) { + my $expected = $rules->{$rule}->{freq_spam}*$ratio_falsepos; if ($expected <= $min_expected) { $skipped_non_nice++; next; @@ -136,7 +254,7 @@ $freq_over_higher_falsepos{$rule}/$expected; ($chisquare{$rule},$prob{$rule}) = chisquare($num_spam,$num_over_higher_falsepos, - $freq_spam{$rule},$freq_over_higher_falsepos{$rule}); + $rules->{$rule}->{freq_spam},$freq_over_higher_falsepos{$rule}); if ($freq_over_higher_falsepos{$rule} < $expected) { $chisquare{$rule} *= -1; } @@ -146,8 +264,9 @@ my $skipped_nice = 0; -foreach $rule (keys %freq_nonspam) { - my $expected = $freq_nonspam{$rule}*$ratio_falseneg; +# nice rules +foreach my $rule (keys %freq_under_lower_falseneg) { + my $expected = $rules->{$rule}->{freq_ham}*$ratio_falseneg; if ($expected <= $min_expected) { $skipped_nice++; next; @@ -158,8 +277,8 @@ $ratio_expected_falseneg{$rule} = $freq_under_lower_falseneg{$rule}/$expected; ($chisquare{$rule},$prob{$rule}) = - chisquare($num_nonspam,$num_under_lower_falseneg, - $freq_nonspam{$rule},$freq_under_lower_falseneg{$rule}); + chisquare($num_ham,$num_under_lower_falseneg, + $rules->{$rule}->{freq_ham},$freq_under_lower_falseneg{$rule}); if ($freq_under_lower_falseneg{$rule} < $expected) { $chisquare{$rule} *= -1; } @@ -167,8 +286,12 @@ warn "Skipped nice: $skipped_nice\n"; -@rules_falsepos = grep {$prob{$_} < .5} (keys %over_expected_falsepos); +# The rest is copied verbatim from before - its complicated and not +# commented and should work unchanged except for the freq_spam and +# freq_ham stuff and fixing some use strict stuff +my @rules_falsepos = grep {$prob{$_} < .5} (keys %over_expected_falsepos); + if (scalar(@rules_falsepos)) { print "RULE\t\tCHISQUARE\tRATIO_FALSEPOS\tOVER_FALSEPOS\tFREQ_OVER ($num_over_higher_falsepos)\n"; my(@rules_falsepos_bad) = @@ -183,7 +306,7 @@ $over_expected_falsepos{$a}) || ($freq_over_higher_falsepos{$b} <=> $freq_over_higher_falsepos{$a})} (@rules_falsepos_bad); - foreach $rule (@rules_falsepos_bad) { + foreach my $rule (@rules_falsepos_bad) { print $rule . "\t" . $prob{$rule} . "\t" . $ratio_expected_falsepos{$rule} . "\t" . $over_expected_falsepos{$rule} . "\t" . @@ -199,9 +322,9 @@ ($chisquare{$a} <=> $chisquare{$b}) || ($ratio_expected_falsepos{$a} <=> $ratio_expected_falsepos{$b}) || - ($freq_spam{$b} <=> - $freq_spam{$a})} (@rules_falsepos_good); - foreach $rule (@rules_falsepos_good) { + ($rules->{$b}->{freq_spam} <=> + $rules->{$a}->{freq_spam})} (@rules_falsepos_good); + foreach my $rule (@rules_falsepos_good) { print $rule . "\t" . $prob{$rule} . "\t" . $ratio_expected_falsepos{$rule} . "\t" . $over_expected_falsepos{$rule} . "\t" . @@ -212,7 +335,7 @@ warn "No over-falsepos to print\n"; } -@rules_falseneg = grep {$prob{$_} < .5} (keys %over_expected_falseneg); +my @rules_falseneg = grep {$prob{$_} < .5} (keys %over_expected_falseneg); if (scalar(@rules_falseneg)) { print "RULE\t\tCHISQUARE\tRATIO_FALSENEG\tOVER_FALSENEG\tFREQ_UNDER ($num_under_lower_falseneg)\n"; @@ -228,7 +351,7 @@ $over_expected_falseneg{$a}) || ($freq_under_lower_falseneg{$b} <=> $freq_under_lower_falseneg{$a})} (@rules_falseneg_bad); - foreach $rule (@rules_falseneg_bad) { + foreach my $rule (@rules_falseneg_bad) { print $rule . "\t" . $prob{$rule} . "\t" . $ratio_expected_falseneg{$rule} . "\t" . $over_expected_falseneg{$rule} . "\t" . @@ -244,9 +367,9 @@ ($chisquare{$a} <=> $chisquare{$b}) || ($ratio_expected_falseneg{$a} <=> $ratio_expected_falseneg{$b}) || - ($freq_spam{$b} <=> - $freq_spam{$a})} (@rules_falseneg_good); - foreach $rule (@rules_falseneg_good) { + ($rules->{$b}->{freq_ham} <=> + $rules->{$a}->{freq_ham})} (@rules_falseneg_good); + foreach my $rule (@rules_falseneg_good) { print $rule . "\t" . $prob{$rule} . "\t" . $ratio_expected_falseneg{$rule} . "\t" . $over_expected_falseneg{$rule} . "\t" . @@ -258,97 +381,3 @@ } exit; - -sub readlogs { - my $spam = $ARGV[0] || "spam.log"; - my $nonspam = $ARGV[1] || (-f "good.log" ? "good.log" : "nonspam.log"); - - - (open(NONSPAM,$nonspam)) || - (die "Couldn't open file '$nonspam': $!; stopped"); - - while (defined($line = )) { - if ($line =~ m/^\s*\#/) { - next; - } elsif ($line =~ m/^.\s+-?\d+\s+\S+\s*(\S*)/) { - my $tests = $1; - my $hits = 0; - my(@tests) = (); - foreach $test (grep {length($_)} (split(/,+/,$tests))) { - if (exists($rules{$test})) { - push @tests, $test; - $hits += $rules{$test}->{score}; - } - } - - if (scalar(@tests)) { - $num_nonspam++; - foreach $test (grep {exists($freq_nonspam{$_})} (@tests)) { - $freq_nonspam{$test}++; - } - if ($hits >= $higher) { - $num_over_higher_falsepos++; - foreach $test (grep - {exists($freq_over_higher_falsepos{$_})} (@tests)) { - $freq_over_higher_falsepos{$test}++; - } - } - } - } elsif ($line =~ m/\S/) { - chomp($line); - warn "Can't interpret line '$line'; skipping"; - } - } - - close(NONSPAM); - - (open(SPAM,$spam)) || (die "Couldn't open file '$spam': $!; stopped"); - - while (defined($line = )) { - if ($line =~ m/^\s*\#/) { - next; - } elsif ($line =~ m/^.\s+-?\d+\s+\S+\s*(\S*)/) { - my $tests = $1; - my $hits = 0; - my $plus_hits = 0; - my(@tests) = (); - foreach $test (grep {length($_)} (split(/,+/,$tests))) { - if (exists($rules{$test})) { - push @tests, $test; - $hits += $rules{$test}->{score}; - if ($rules{$test}->{score} > 0) { - $plus_hits += $rules{$test}->{score}; - } - } - } - - if (scalar(@tests)) { - $num_spam++; - foreach $test (grep {exists($freq_spam{$_})} (@tests)) { - $freq_spam{$test}++; - } - if (($hits <= $lower) && $plus_hits && - ($plus_hits >= $lower)) { - $num_under_lower_falseneg++; - foreach $test (grep - {exists($freq_under_lower_falseneg{$_})} (@tests)) { - $freq_under_lower_falseneg{$test}++; - } - } - } - } elsif ($line =~ m/\S/) { - chomp($line); - warn "Can't interpret line '$line'; skipping"; - } - } - - close(SPAM); -} - - -sub readscores { - system ("./parse-rules-for-masses") and - die "Couldn't do parse-rules-for-masses: $?; stopped"; - require "./tmp/rules.pl"; -} - Index: masses/tenpass/10pass-compute-tcr =================================================================== --- masses/tenpass/10pass-compute-tcr (revision 20231) +++ masses/tenpass/10pass-compute-tcr (working copy) @@ -6,12 +6,12 @@ do mkdir tmp/10passrules > /dev/null 2>&1 cp ../rules/[0-9]*.cf tmp/10passrules - ./rewrite-cf-with-new-scores $SCORESET ../rules/50_scores.cf \ - tenpass_results/scores.$run > tmp/10passrules/50_scores.cf + ./rewrite-cf-with-new-scores -s $SCORESET --old=../rules/50_scores.cf \ + --new=tenpass_results/scores.$run --out=tmp/10passrules/50_scores.cf \ + --cffile=../rules ./fp-fn-statistics --cffile=tmp/10passrules \ - --spam=tenpass_results/spam.log.$run \ - --nonspam=tenpass_results/ham.log.$run > tmp/stats + --logfile=tenpass_results/masses.log.$run > tmp/stats grep TCR: tmp/stats done Index: masses/tenpass/10pass-run =================================================================== --- masses/tenpass/10pass-run (revision 20231) +++ masses/tenpass/10pass-run (working copy) @@ -1,13 +1,10 @@ #!/bin/sh # change these! -NSBASE=ham-logs -SPBASE=spam-logs -SCORESET="0" +BASE=logs/ passes="1 2 3 4 5 6 7 8 9 10" -mkdir -p tenpass_results -mkdir -p ORIG +mkdir tenpass_results > make.output @@ -17,28 +14,27 @@ echo "Training for corpus $id..." pwd; date - > ORIG/ham-set$SCORESET.log - > ORIG/spam-set$SCORESET.log - + > masses.log echo -n "(using corpora blocks: " for notid in $passes ; do if [ "$notid" != "$id" ] ; then echo -n "$notid " - cat $NSBASE/split-$notid.log >> ORIG/ham-set$SCORESET.log - cat $SPBASE/split-$notid.log >> ORIG/spam-set$SCORESET.log + cat $BASE/split-$notid.log >> masses.log fi done echo "for training)" make clean >> make.output - make >> make.output 2>&1 - ./runGA - pwd - date + make perceptron 2>&1 >> make.output + ./perceptron + pwd; date echo "Saving test data for corpus $id..." - cp $NSBASE/split-$id.log tenpass_results/ham.log.$id - cp $SPBASE/split-$id.log tenpass_results/spam.log.$id - cp gen-set$SCORESET.scores tenpass_results/scores.$id + cp $BASE/split-$id.log tenpass_results/masses.log.$id + + cp perceptron.scores tenpass_results/scores.$id + done + +