Index: lib/Mail/SpamAssassin.pm
===================================================================
--- lib/Mail/SpamAssassin.pm	(revision 20231)
+++ lib/Mail/SpamAssassin.pm	(working copy)
@@ -1263,11 +1263,13 @@
 
     # read a file called "init.pre" in site rules dir *before* all others;
     # even the system config.
+
+    # Save this in $self so that it can be accessed externally (for logging, etc.)
+    $self->{site_rules_filename} ||= $self->first_existing_path (@site_rules_path);
     my $siterules = $self->{site_rules_filename};
-    $siterules ||= $self->first_existing_path (@site_rules_path);
 
+    $self->{rules_filename} ||= $self->first_existing_path (@default_rules_path);
     my $sysrules = $self->{rules_filename};
-    $sysrules ||= $self->first_existing_path (@default_rules_path);
 
     if ($siterules) {
       $fname = File::Spec->catfile ($siterules, "init.pre");
@@ -1300,8 +1302,8 @@
       $self->get_and_create_userstate_dir();
 
       # user prefs file
+      $self->{userprefs_filename} ||= $self->first_existing_path (@default_userprefs_path);
       $fname = $self->{userprefs_filename};
-      $fname ||= $self->first_existing_path (@default_userprefs_path);
 
       if (defined $fname) {
         if (!-f $fname && !$self->{dont_copy_prefs} && !$self->create_default_prefs($fname)) {
Index: lib/Mail/SpamAssassin/Masses.pm
===================================================================
--- lib/Mail/SpamAssassin/Masses.pm	(revision 0)
+++ lib/Mail/SpamAssassin/Masses.pm	(revision 0)
@@ -0,0 +1,788 @@
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+
+=head1 NAME
+
+Mail::SpamAssassin::Masses - Interface for reading and parsing rules
+and mass-check logs for SpamAssassin
+
+=head1 SYNOPSIS
+
+  my $parser = Mail::SpamAssassin::Masses->new();
+  $parser->readlogs();    # reads the rules first if necessary
+  my ($rules, $logs) = ($parser->get_rules_hash(), $parser->get_logs());
+  foreach my $test (keys %$rules) {
+    if ($rules->{$test}->{score} > 1) {
+      ...
+    }
+  }
+
+=head1 DESCRIPTION
+
+Mail::SpamAssassin::Masses is a module to simplify the many scripts
+that used to make up the SpamAssassin re-scoring process. By
+consolidating all the shared code in one module, the scripts can be
+simplified and require fewer temporary files.
+
+=head1 METHODS
+
+=over 4
+
+=cut
+
+package Mail::SpamAssassin::Masses;
+
+use strict;
+use warnings;
+use Carp;
+
+=item $parser = Mail::SpamAssassin::Masses->new( [ { opt => val, ... } ] );
+
+Construct a new Mail::SpamAssassin::Masses object. You may pass the
+following attribute-value pairs to the constructor.
+
+=over 4
+
+=item rulesdir
+
+The directory containing rules. If multiple directories are desired,
+an anonymous array should be passed.
+
+=item scoreset
+
+Scoreset to deal with.
+
+=item logfile
+
+Filename of mass-check log.
+
+=item falses
+
+Also count frequencies for false positives and false negatives from
+the logs.
+
+=item falsesonly
+
+Only count false positives and false negatives.
+
+=item greprules
+
+Coderef that is passed a rule name and a hash ref with the entries
+containing info about the rule. If it returns false, the rule is discarded.
+
+=item greplogs
+
+Coderef that is passed a raw log entry. If it returns false, the entry
+is skipped.
+
+=item sliding_window
+
+Use a sliding window for score ranges rather than a shrinking window.
+
+=item nologs
+
+Save memory by not saving the individual log results, just the
+aggregate totals.
+
+=back
+
+=cut
+
+sub new {
+  my ($proto, $opts) = @_;
+
+  # Allow both Class->new() and $object->new().
+  my $class = ref($proto) || $proto;
+
+  # The caller's option hash (if any) becomes the object itself.
+  my $self = defined $opts ? $opts : { };
+
+  # Fill in defaults for anything unset (or set to a false value).
+  for my $default ([ scoreset => 0 ],
+                   [ rulesdir => '' ],
+                   [ logfile  => "masses.log" ]) {
+    my ($key, $value) = @$default;
+    $self->{$key} ||= $value;
+  }
+
+  return bless($self, $class);
+}
+
+=item $parser->readrules()
+
+Read and parse the rules from the directory specified as
+C<rulesdir>. This loads the following keys and values into the hash
+entry representing the rules (see below).
+
+=over 4
+
+=item name
+
+Contains the rule's name.
+
+=item score
+
+Contains the rule's score.
+
+=item type
+
+Contains the rule's type (header, body, uri, etc.)
+
+=item tflags
+
+Contains the rule's tflags (nice, autolearn, etc.) as specified in the config file.
+
+=item lang
+
+Set to the value of C<lang> for language-specific tests.
+
+=item issubrule
+
+Set to true if the rule is a sub-rule (i.e. its name starts with
+__). Otherwise, undefined.
+
+=item isnice
+
+This key exists and is true if the rule is nice (i.e. with a score
+that can be below zero).
+
+=item describe
+
+Set to the rule's description, in English, or in the rule's language.
+
+=back
+
+There may be more values once C<readlogs()> is run.
+
+=cut
+
+
+sub readrules {
+  # Parse the rules file(s) named by rulesdir into the $self->{rules} hash.
+  my $self = shift;
+
+  $self->{rules} ||= { };
+  my $rules = $self->{rules}; # $rules is a reference to the anon hash
+
+  my @dirs = ref($self->{rulesdir}) ? @{$self->{rulesdir}} : $self->{rulesdir};
+
+  local $_;   # the read loop below assigns to $_; keep the caller's value
+
+  foreach my $indir (@dirs) {
+    # A directory contributes all of its *.cf files; anything else is
+    # taken to be the name of a single rules file.
+    my @files = (-d $indir)
+      ? glob("$indir/*.cf")   # no reason to only do numbered files
+      : ( $indir );
+
+    foreach my $file (@files) {
+      open (my $in, '<', $file) || croak("Can't open $file, $!");
+      while (<$in>) {
+        s/#.*$//g;
+        s/^\s+//;
+        s/\s+$//;
+        next if /^$/;
+
+        my $lang = '';
+        if (s/^lang\s+(\S+)\s+//) {
+          $lang = lc $1;
+        }
+
+        if (/^(header|rawbody|body|full|uri|meta)\s+(\S+)\s+/) {
+          my $type = $1;
+          my $name = $2;
+
+          $rules->{$name} ||= { };
+          $rules->{$name}->{name} = $name;
+          $rules->{$name}->{type} = $type;
+          $rules->{$name}->{lang} = $lang if $lang;
+          $rules->{$name}->{tflags} = '';
+
+          if ($name =~ /^__/) {
+            $rules->{$name}->{issubrule} = '1';
+          }
+
+        } elsif (/^describe\s+(\S+)\s+(.+)$/) {
+
+          # Let's get description in english, por favor -- unless the rule isn't english
+
+          next if ($lang && (!$rules->{$1}->{lang} || $rules->{$1}->{lang} ne $lang));
+
+          $rules->{$1} ||= { };
+          $rules->{$1}->{describe} = $2;
+
+        } elsif (/^tflags\s+(\S+)\s+(.+)$/) {
+          my $name = $1;
+          $rules->{$name} ||= { };
+          $rules->{$name}->{tflags} = $2;
+          if ($2 =~ /nice/) {
+            $rules->{$name}->{isnice} = 1;
+          }
+        } elsif (/^score\s+(\S+)\s+(.+)$/) {
+          my($name,$score) = ($1,$2);
+          $rules->{$name} ||= { };
+          if ( $score =~ /\s/ ) { # there are multiple scores
+            ($score) = (split(/\s+/,$score))[$self->{scoreset}];
+          }
+          $rules->{$name}->{score} = $score;
+        }
+      }
+      close $in;
+    }
+  }
+  foreach my $rule (keys %{$rules}) {
+    if (!defined $rules->{$rule}->{type}) {
+      delete $rules->{$rule};   # no rule definition -> no rule
+      next;
+    }
+
+    if (!defined $rules->{$rule}->{score}) {
+      my $def = 1.0;
+      if ($rule =~ /^T_/) { $def = 0.01; }
+
+      if ($rules->{$rule}->{isnice}) {
+        $rules->{$rule}->{score} = -$def;
+      } else {
+        $rules->{$rule}->{score} = $def;
+      }
+    }
+
+    if ($self->{greprules} && !&{$self->{greprules}}($rule, $rules->{$rule}))
+    {
+      delete $rules->{$rule};
+      next;
+    }
+
+  }
+
+  $self->{_readrules} = 1;
+}
+
+=item $parser->readlogs()
+
+Read and parse the mass-check log named by C<logfile>. This will create the
+anonymous array of hashes referred to by C<< $parser->{logs} >>, with the
+following keys:
+
+=over 4
+
+=item isspam
+
+True if the message is spam. False or undefined otherwise.
+
+=item isfalse
+
+True if the message was a false negative or positive.
+
+=item tests_hit
+
+Array reference containing references to the hash representing each
+rule hit.
+
+=item score
+
+Score the message received (under current scores).
+
+=back
+
+In addition, this method adds the following keys to the rule
+information in C<< $parser->{rules} >>.
+
+=over 4
+
+=item freq_spam
+
+Frequency hit in spam.
+
+=item freq_ham
+
+Frequency hit in ham.
+
+=item freq_fp
+
+Frequency in false positives.
+
+=item freq_fn
+
+Frequency in false negatives.
+
+=back
+
+Also sets C<< $parser->{num_spam} >> and C<< $parser->{num_ham} >> to the
+number of spam logs read and the number of ham logs read, respectively.
+
+=cut
+
+sub readlogs {
+  # Tally per-rule hit counts, and optionally per-message records, from logfile.
+  my $self = shift;
+
+  if (!$self->{_readrules}) {
+    # need to read scores first!
+    $self->readrules();
+  }
+
+  my $rules = $self->{rules}; # copy the ref, shorthand
+
+  my $logs;
+  if (! $self->{nologs}) {
+    $self->{logs} ||= [ ];
+    $logs = $self->{logs};
+  }
+
+  local $_;   # the read loop below assigns to $_; keep the caller's value
+  my ($num_spam, $num_ham, $count, $num_fp, $num_fn);
+  $num_spam = $num_ham = $count = $num_fp = $num_fn = 0;
+
+  # First, initialize stuff
+  foreach my $rule (values %{$self->{rules}}) {
+    $rule->{freq_spam} ||= 0;
+    $rule->{freq_ham} ||= 0;
+
+    if($self->{falses}) {
+      $rule->{freq_fp} ||= 0;
+      $rule->{freq_fn} ||= 0;
+    }
+
+  }
+
+  my $file = $self->{logfile};
+  open (my $in, '<', $file) || croak("Can't open $file, $!");
+
+  while (<$in>) {
+    next if /^\#/;
+    next if /^$/;
+    if($_ !~ /^(.)\s+(.)\s+-?[\d.]+\s+\S+(\s+\S+\s+)/) { warn "bad line: $_"; next; }
+    # Save the captures at once, before any other code gets to run a regex.
+    my ($manual, $result, $hits) = ($1, $2, $3);
+
+    if ($self->{greplogs} && !&{$self->{greplogs}}($_)) {
+      next;
+    }
+
+    $_ = $hits;
+    s/(?:bayes|time)=\S+//;
+    s/,,+/,/g;
+    s/^\s+//;
+    s/\s+$//;
+
+    # The verdicts disagree: flag the log entry as a false positive/negative.
+    if ($manual ne $result) {
+      $logs->[$count]->{isfalse} = 1 unless $self->{nologs};
+    }
+    elsif ($self->{falsesonly}) {
+      next;
+    }
+
+    if ($manual eq "s") {
+      $num_spam++;
+      $logs->[$count]->{isspam} = 1 unless $self->{nologs};
+      $num_fn++ if $result eq "h";
+    } else {
+      $num_ham++;
+      $num_fp++ if $result eq "s";
+    }
+
+    my @tests = ();
+    my $score = 0;
+    foreach my $tst (split (/,/, $_)) {
+      next if ($tst eq '');
+
+      # Don't count non-existant rules
+      # (Could happen with greprules)
+      next if ( !$rules->{$tst} || !$rules->{$tst}->{type} );
+
+      if ($manual eq "s") {
+        $rules->{$tst}->{freq_spam}++;
+        $rules->{$tst}->{freq_fn}++ if ($self->{falses} && $result eq "h");
+      }
+      else {
+        $rules->{$tst}->{freq_ham}++;
+        $rules->{$tst}->{freq_fp}++ if ($self->{falses} && $result eq "s");
+      }
+
+      $score += $rules->{$tst}->{score};
+
+      push (@tests, $rules->{$tst}) unless $self->{nologs};
+    }
+
+    $logs->[$count]->{tests_hit} = \@tests unless $self->{nologs};
+    $logs->[$count]->{score} = $score unless $self->{nologs};
+
+    $count++;
+  }
+  close $in;
+
+  $self->{num_spam} = $num_spam;
+  $self->{num_ham} = $num_ham;
+  if ($self->{falses}) {
+    $self->{num_fn} = $num_fn;
+    $self->{num_fp} = $num_fp;
+  }
+
+  $self->{_readlogs} = 1; # Done reading logs
+
+}
+
+=item $parser->do_statistics();
+
+Calculate the hit percentages and the S/O ratio for each test.
+
+This adds the following keys to the rules hashes.
+
+=over 4
+
+=item spam_percent
+
+Percentage of spam messages hit.
+
+=item ham_percent
+
+Percentage of ham messages hit.
+
+=item soratio
+
+S/O ratio -- percentage of spam messages hit divided by total
+percentage of messages hit.
+
+=back
+
+=cut
+
+sub do_statistics {
+  my $self = shift;
+
+  if (! $self->{_readlogs} ) {
+    $self->readlogs();
+  }
+
+  # Turn each rule's raw hit counts into percentages of the spam and ham
+  # totals, then derive the S/O ratio from those percentages.
+
+  foreach my $rule (values %{$self->{rules}}) {
+
+    if (!$rule->{freq_spam}) {
+      $rule->{spam_percent} = 0;
+    } else {
+      $rule->{spam_percent} = $rule->{freq_spam} / $self->{num_spam} * 100.0;
+    }
+
+    if (!$rule->{freq_ham}) {
+      $rule->{ham_percent} = 0;
+    } else {
+      $rule->{ham_percent} = $rule->{freq_ham} / $self->{num_ham} * 100.0;
+    }
+
+    if (!$rule->{freq_spam} && !$rule->{freq_ham}) {
+      $rule->{soratio} = 0.5;   # no hits at all: skip the 0/0 division below
+      next;
+    }
+
+    $rule->{soratio} = $rule->{spam_percent} / ($rule->{spam_percent} + $rule->{ham_percent});
+
+  }
+
+  $self->{_statistics} = 1;
+
+}
+
+=item $parser->do_rank();
+
+Calculates the ranking for each rule and stores this in the
+appropriate key.
+
+=over 4
+
+=item rank
+
+"Rank" of the rule. High numbers are good, low are bad.
+
+=back
+
+=cut
+
+sub do_rank {
+  my ($self) = @_;
+
+  # The ranking is built from the per-rule hit counts.
+  $self->do_statistics() unless $self->{_statistics};
+
+  my $rules = $self->{rules};
+
+  # Hit counts per rule name ("wanted" = ham for nice rules, spam for the
+  # rest; "unwanted" = the other corpus), plus the distinct counts seen.
+  my (%hits_wanted, %hits_unwanted, %distinct_wanted, %distinct_unwanted);
+
+  foreach my $entry (values %{$self->{rules}}) {
+    my $name = $entry->{name};
+    if ($entry->{isnice}) {
+      $hits_wanted{$name}   = $entry->{freq_ham};
+      $hits_unwanted{$name} = $entry->{freq_spam};
+    } else {
+      $hits_wanted{$name}   = $entry->{freq_spam};
+      $hits_unwanted{$name} = $entry->{freq_ham};
+    }
+    $distinct_wanted{ $hits_wanted{$name} }     = 1;
+    $distinct_unwanted{ $hits_unwanted{$name} } = 1;
+  }
+
+  # First half: position by ascending wanted hits; equal counts share one.
+  my ($step, $previous) = (0, undef);
+  foreach my $name (sort { $hits_wanted{$a} <=> $hits_wanted{$b} }
+                    keys %hits_wanted) {
+    my $hits = $hits_wanted{$name};
+    $step++ if defined $previous && $previous != $hits;
+    $rules->{$name}->{rank} += $step;
+    $previous = $hits;
+  }
+
+  # Avoid divide by 0 errors!
+  my $n_unwanted = scalar keys %distinct_unwanted;
+  die "Error: no rules read" if !$n_unwanted;
+
+  # Ratio of distinct wanted counts to distinct unwanted counts; the
+  # second half's positions are multiplied by it.
+  my $scale = (scalar keys %distinct_wanted) / $n_unwanted;
+
+  # Second half: position by descending unwanted hits, tracking the
+  # smallest and largest combined rank along the way.
+  my ($highest, $lowest) = (0, 9999999);
+  ($step, $previous) = (0, undef);
+  foreach my $name (sort { $hits_unwanted{$b} <=> $hits_unwanted{$a} }
+                    keys %hits_unwanted) {
+    my $hits = $hits_unwanted{$name};
+    $step++ if defined $previous && $previous != $hits;
+    my $rank = ($rules->{$name}->{rank} += ($step * $scale));
+    $previous = $hits;
+    $highest = $rank if $rank > $highest;
+    $lowest  = $rank if $rank < $lowest;
+  }
+
+  # Rescale every rank relative to the observed spread.
+  my $spread = $highest - $lowest;
+  foreach my $entry (values %{$rules}) {
+    $entry->{rank} = ($spread == 0) ? 0.001 : (($entry->{rank} - $lowest) / $spread);
+  }
+
+  $self->{_rank} = 1;
+}
+
+=item $parser->get_rules_array();
+
+Returns a reference to an array of hash references. Each of these
+hashes has the keys listed above.
+
+=cut
+
+sub get_rules_array {
+  my ($parser) = @_;
+  return [ values %{ $parser->{rules} } ];   # new array holding the rule hashrefs
+}
+
+=item $parser->get_rules_hash();
+
+Returns a reference to a hash with rule names as keys and hash
+references as values. Each of these value hashes has the keys listed
+above.
+
+=cut
+
+sub get_rules_hash {
+  my ($parser) = @_;
+  return $parser->{rules};   # the stored hash ref itself, not a copy
+}
+
+=item $parser->get_logs();
+
+Returns a reference to the array containing log entries, in the form
+of anonymous hashes with keys as described above.
+
+=cut
+
+sub get_logs {
+  my ($parser) = @_;
+  return $parser->{logs};   # the stored array ref itself, not a copy
+}
+
+=item $parser->get_num_ham();
+
+Returns number of ham logs read.
+
+=cut
+
+sub get_num_ham {
+  my ($parser) = @_;
+  return $parser->{num_ham};
+}
+
+=item $parser->get_num_spam();
+
+Returns number of spam logs read.
+
+=cut
+
+sub get_num_spam {
+  my ($parser) = @_;
+  return $parser->{num_spam};
+}
+
+=item $parser->do_score_ranges();
+
+Figure out the range in which each score can be set, based on the soratio,
+etc. This is necessary so that the perceptron doesn't set silly scores.
+(This may not be as much of a problem as it was with the old GA.)
+
+This adds the following keys to the rules hashes:
+
+=over 4
+
+=item ismutable
+
+Determines whether the perceptron can select a score for this test.
+
+=item range_lo
+
+Determines the lowest score the perceptron can set.
+
+=item range_hi
+
+Determines the highest score the perceptron can set.
+
+=back
+
+=cut
+
+sub do_score_ranges {
+  # Derive ismutable, range_lo and range_hi for every rule from its rank and stats.
+  my $self = shift;
+
+  if ( !$self->{_statistics} ) {
+    $self->do_statistics();
+  }
+  if ( !$self->{_rank} ) {
+    $self->do_rank();
+  }
+
+  foreach my $rule (values %{$self->{rules}}) {
+
+    my ($lo, $hi);
+
+    my $rank = $rule->{rank};
+
+    # Get rid of rules that don't hit -- and disable completely.
+    if ($rule->{spam_percent} + $rule->{ham_percent} < 0.01 ||
+        $rule->{score} == 0) {
+
+      $rule->{ismutable} = 0;
+      $rule->{range_lo} = $rule->{range_hi} = 0;
+      next;
+
+    }
+
+    # next: get rid of tests that don't apply in this scoreset
+    # or are userconf -- set ismutable to 0, but keep the score
+    if ($rule->{tflags} =~ /\buserconf\b/ ||
+        (($self->{scoreset} % 2) == 0 && $rule->{tflags} =~/\bnet\b/)) {
+
+      $rule->{ismutable} = 0;
+      $rule->{range_lo} = $rule->{range_hi} = $rule->{score};
+      next;
+
+    }
+
+
+    # Normal rules:
+
+    # This seems to convert from [-1,1] to [0,1] but we're already in
+    # [0,1] space - Is this right?
+
+    # The current way ranks are calculated, > 0.5 and < 0.5 have no
+    # special meaning
+
+#      # 0.0 = best nice, 1.0 = best nonnice
+#      if ($rule->{isnice}) {
+#        $rank = .5 - ($rank / 2);
+#      } else {
+#        $rank = .5 + ($rank / 2);
+#      }
+
+    # using this seems to work better
+
+    if($rule->{isnice}) {
+      $hi = 0;
+      $lo = $rank * -4.5;
+    } else {
+      $hi = $rank * 4.5;
+      $lo = 0;
+    }
+
+    # Modify good rules to be lower
+    if ($rule->{isnice}) {
+      if ($rule->{tflags} =~ /\blearn\b/) { # learn rules should get
+                                            # higher scores (-5.4)
+        $lo *= 1.8;
+      }
+      elsif ( $rule->{soratio} <= 0.05 && $rule->{ham_percent} > 0.5) {
+        $lo *= 1.5;
+      }
+
+      # argh, ugly... but i'm copying it whole...
+      $hi = ($rule->{soratio} == 0) ? $lo :
+            ($rule->{soratio} <= 0.005 ) ? $lo/1.1 :
+            ($rule->{soratio} <= 0.010 && $rule->{ham_percent} > 0.2) ? $lo/2.0 :
+            ($rule->{soratio} <= 0.025 && $rule->{ham_percent} > 1.5) ? $lo/10.0 :
+            0;
+
+      if ($rule->{soratio} >= 0.35 ) {
+        ($lo, $hi) = (0,0);
+      }
+    }
+    else { # Make non-nice rules have higher scores if they're good
+      if ($rule->{tflags} =~ /\blearn\b/ ) {
+        $hi *= 1.8;
+      }
+      elsif ( $rule->{soratio} >= 0.99 && $rule->{spam_percent} > 1.0) {
+        $hi *= 1.5;
+      }
+
+      $lo = ($rule->{soratio} == 1) ? $hi:
+            ($rule->{soratio} >= 0.995 ) ? $hi/4.0 :
+            ($rule->{soratio} >= 0.990 && $rule->{spam_percent} > 1.0) ? $hi/8.0 :
+            ($rule->{soratio} >= 0.900 && $rule->{spam_percent} > 10.0) ? $hi/24.0 :
+            0;
+
+      if ($rule->{soratio} <= 0.65 ) { # auto-disable bad rules
+        ($lo, $hi) = (0,0);
+      }
+    }
+
+
+    # Some sanity checking
+    if($hi < $lo) {
+      ($lo, $hi) = ($hi, $lo);
+    }
+
+
+    $rule->{ismutable} = ($lo == $hi) ? 0 : 1;
+    $rule->{range_lo} = $lo;
+    $rule->{range_hi} = $hi;
+
+  }
+}
+
+
+# Pacify perl
+1;

Property changes on: masses
___________________________________________________________________
Name: svn:ignore
   - craig-evolve.scores
craig-evolve.scores.all
craig-evolve.scores.neg
craig-evolve.scores.pos
falseneg.log
falsepos.log
trueneg.log
truepos.log
spam*.log
ham*.log
nonspam*.log
newscores
analysis
spam.sonic
nonspam.sonic
spam.local
nonspam.local
commands.sh
scores.h
tests.h
tmp
old-random-search
result.*
results.*
overnight.*
galib245
pgapack
evolve.scores
evolve
craig-evolve
goodresults
freqs
logs
spam.dogma
nonspam.dogma
spam.alltime
PI*
nonspam.jm-traps
nonspam.log*
spam.jm-traps
spam.log*
dprof.*
tmon.*
spamassassin
spamassassin.prefs
jm-mass-check-rules

   + craig-evolve.scores
craig-evolve.scores.all
craig-evolve.scores.neg
craig-evolve.scores.pos
falseneg.log
falsepos.log
trueneg.log
truepos.log
spam*.log
ham*.log
nonspam*.log
newscores
analysis
spam.sonic
nonspam.sonic
spam.local
nonspam.local
commands.sh
scores.h
tests.h
tmp
old-random-search
result.*
results.*
overnight.*
galib245
pgapack
evolve.scores
evolve
craig-evolve
goodresults
freqs
logs
spam.dogma
nonspam.dogma
spam.alltime
PI*
nonspam.jm-traps
nonspam.log*
spam.jm-traps
spam.log*
dprof.*
tmon.*
spamassassin
spamassassin.prefs
jm-mass-check-rules
masses.log*
masses*.log
tenpass_results
perceptron.scores


Index: masses/parse-rules-for-masses
===================================================================
--- masses/parse-rules-for-masses	(revision 20231)
+++ masses/parse-rules-for-masses	(working copy)
@@ -1,148 +0,0 @@
-#!/usr/bin/perl
-#
-# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# </@LICENSE>
-
-sub usage {
-  die "
-parse-rules-for-masses: parse the SpamAssassin rules files for mass-checks,
-        evolving, and frequency analysis
-
-usage: ./parse-rules-for-masses [-d rulesdir] [-o outputfile] [-s scoreset]
-
-rulesdir defaults to ../rules
-outputfile defaults to ./tmp/rules.pl
-scoreset default to 0
-
-";
-}
-
-use Getopt::Long;
-use Data::Dumper;
-
-use vars qw(@rulesdirs $outputfile $scoreset);
-GetOptions (
-                "d=s" => \@rulesdirs,
-                "o=s" => \$outputfile,
-		"s=i" => \$scoreset,
-                "help|h|?" => sub { usage(); } );
-
-if ($#rulesdirs < 0) {
-  @rulesdirs = ("../rules");
-}
-
-if (!defined $outputfile) {
-  $outputfile = "./tmp/rules.pl";
-  mkdir ("tmp", 0755);
-}
-
-$scoreset = 0 if ( !defined $scoreset );
-
-my $rules = { };
-readrules(@rulesdirs);
-
-my $scores = { };
-foreach my $key (keys %{$rules}) {
-  $scores->{$key} = $rules->{$key}->{score};
-}
-
-writerules($outputfile);
-exit;
-
-sub readrules {
-  foreach my $indir (@_) {
-    my @files = <$indir/[0-9]*.cf>;
-    my $file;
-    %rulesfound = ();
-    %langs = ();
-    foreach $file (sort @files) {
-      open (IN, "<$file");
-      while (<IN>) {
-        s/#.*$//g; s/^\s+//; s/\s+$//; next if /^$/;
-
-        my $lang = '';
-        if (s/^lang\s+(\S+)\s+//) {
-          $lang = $1;
-        }
-
-        if (/^(header|rawbody|body|full|uri|meta)\s+(\S+)\s+/) {
-          my $type = $1;
-          my $name = $2;
-
-          my $issubrule = '0';
-          if ($name =~ /^__/) { $issubrule = '1'; }
-
-          $rules->{$1} ||= { };
-          $rules->{$name}->{type} = $type;
-          $rules->{$name}->{lang} = $lang;
-          $rules->{$name}->{issubrule} = $issubrule;
-          $rules->{$name}->{tflags} = '';
-
-        } elsif (/^describe\s+(\S+)\s+(.+)$/) {
-          $rules->{$1} ||= { };
-          $rules->{$1}->{describe} = $2;
-
-        } elsif (/^tflags\s+(\S+)\s+(.+)$/) {
-          $rules->{$1} ||= { };
-          $rules->{$1}->{tflags} = $2;
-
-        } elsif (/^score\s+(\S+)\s+(.+)$/) {
-	  my($name,$score) = ($1,$2);
-          $rules->{$name} ||= { };
-	  if ( $score =~ /\s/ ) { # there are multiple scores
-	    ($score) = (split(/\s+/,$score))[$scoreset];
-	  }
-          $rules->{$name}->{score} = $score;
-        }
-      }
-      close IN;
-    }
-  }
-
-  foreach my $rule (keys %{$rules}) {
-    if (!defined $rules->{$rule}->{type}) {
-      delete $rules->{$rule};   # no rule definition -> no rule
-      next;
-    }
-
-    if (!defined $rules->{$rule}->{score}) {
-      my $def = 1.0;
-      if ($rule =~ /^T_/) { $def = 0.01; }
-
-      if ($rules->{$rule}->{tflags} =~ /nice/) {
-        $rules->{$rule}->{score} = -$def;
-      } else {
-        $rules->{$rule}->{score} = $def;
-      }
-    }
-  }
-}
-
-sub writerules {
-  my $outfile = shift;
-  # quick hack to create the tmp directory
-  system ("mkdir -p $outfile 2>/dev/null ; rmdir $outfile 2>/dev/null");
-
-  open (OUT, ">$outfile") or die "cannot write to $outfile";
-  print OUT "# dumped at ".`date`."\n";
-
-  $Data::Dumper::Purity = 1;
-  print OUT Data::Dumper->Dump ([$rules, $scores], ['*rules', '*scores']);
-
-  print OUT "1;";
-  close OUT;
-}
-
Index: masses/hit-frequencies
===================================================================
--- masses/hit-frequencies	(revision 20231)
+++ masses/hit-frequencies	(working copy)
@@ -16,385 +16,256 @@
 # limitations under the License.
 # </@LICENSE>
 
+
 use FindBin;
-use Getopt::Std;
-getopts("fm:M:X:l:L:pxhc:at:s:i");
+use lib "$FindBin::Bin/../lib";
+use Mail::SpamAssassin::Masses;
+use Getopt::Long qw(:config bundling auto_help);
+use Pod::Usage;
+use strict;
+use warnings;
 
+
 use vars qw {
   $opt_f $opt_m $opt_M $opt_X $opt_p $opt_x $opt_h $opt_l $opt_L $opt_c
-  $opt_a $opt_t $opt_s $opt_i $sorting
+  $opt_a $opt_t $opt_s $opt_z $opt_inclang $opt_auto
 };
 
-sub usage {
-  die "hit-frequencies [-c rules dir] [-f] [-m RE] [-M RE] [-X RE] [-l LC]
-                [-s SC] [-a] [-p] [-x] [-i] [spam log] [ham log]
+GetOptions("c|cffile=s@" => \$opt_c,
+	   "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
+	   "l|logfile=s" => \$opt_l,
+	   "f|falses" => \$opt_f,
+	   "a|all" => \$opt_a,
+	   "p|percentages" => \$opt_p,
+	   "x|extended" => \$opt_x,
+	   "m|matchrule=s" => \$opt_m, #,
+	   "t|tflags=s" => \$opt_t,
+	   "M|matchlog=s" => \$opt_M,
+	   "X|excludelog=s" => \$opt_X,
+	   "L|language=s" => \$opt_L,
+	   "include-language=s" => \$opt_inclang);
 
-    -c p   use p as the rules directory
-    -f     falses. count only false-negative or false-positive matches
-    -m RE  print rules matching regular expression
-    -t RE  print rules with tflags matching regular expression
-    -M RE  only consider log entries matching regular expression
-    -X RE  don't consider log entries matching regular expression
-    -l LC  also print language specific rules for lang code LC (or 'all')
-    -L LC  only print language specific rules for lang code LC (or 'all')
-    -a     display all tests
-    -p     percentages. implies -x
-    -x     extended output, with S/O ratio and scores
-    -s SC  which scoreset to use
-    -i     use IG (information gain) for ranking
 
-    options -l and -L are mutually exclusive.
+=head1 NAME
 
-    options -M and -X are *not* mutually exclusive.
+hit-frequencies - Display statistics about tests hit by a mass-check run
 
-    if either the spam or and ham logs are unspecified, the defaults
-    are \"spam.log\" and \"ham.log\" in the cwd.
+=head1 SYNOPSIS
 
-";
-}
+hit-frequencies [options]
 
-usage() if($opt_h || ($opt_l && $opt_L));
+ Options:
+    -c,--cffile=path	  Use path as the rules directory
+    -s,--scoreset=n	  Use scoreset n
+    -l,--logfile=file	  Read in file instead of masses.log
+    -f			  Count only false-positives/false-negatives
+    -a			  Report all tests (including subrules)
+    -p			  Report percentages instead of raw hits
+    -x			  "Extended" output, include RANK, S/O and SCORE
+    -m,--matchrule=re     Print rules matching the regular expression
+    -t,--tflags=re	  Print only rules with tflags matching the regular expression
+    -M,--matchlog=re      Consider only logs matching the regular expression
+    -X,--excludelog=re	  Exclude logs matching this regular expression
+    -L,--language=lc	  Only print language specific tests for specified lang code (try 'all')
+    --include-language=lc Also print language specific tests for specified lang code (try 'all')
 
-if ($opt_p) {
-  $opt_x = 1;
-}
+=head1 DESCRIPTION
 
-$opt_s = 0 if ( !defined $opt_s );
+B<hit-frequencies> will read the mass-check log F<masses.log> or the
+log given by the B<--logfile> option. The output will contain a
+summary of the number of ham and spam messages and detailed statistics
+for each rule. By default, B<hit-frequencies> will try to guess the
+proper values for B<--cffile> based on the header of the
+masses.log. The output will include the following columns:
 
-my $cffile = $opt_c || "$FindBin::Bin/../rules";
+=over 4
 
-my %freq_spam = ();
-my %freq_ham = ();
-my $num_spam = 0;
-my $num_ham = 0;
-my %ranking = ();
-my $ok_lang = '';
+=item OVERALL
 
-readscores($cffile);
+Number of times (or percentage with B<-p>) the rule hit on
+all messages (spam or ham).
 
-$ok_lang = lc ($opt_l || $opt_L || '');
-if ($ok_lang eq 'all') { $ok_lang = '.'; }
+=item SPAM
 
-foreach my $key (keys %rules) {
+Number of times (or percentage with B<-p>) the rule hit on
+spam messages.
 
-  if ( ($opt_L && !$rules{$key}->{lang}) ||
-       ($rules{$key}->{lang} &&
-         (!$ok_lang || $rules{$key}->{lang} !~ /^$ok_lang/i)
-     ) ) {
-    delete $rules{$key} ; next;
-  }
+=item HAM
 
-  $freq_spam{$key} = 0;
-  $freq_ham{$key} = 0;
-}
+Number of times (or percentage with B<-p>) the rule hit on
+ham messages.
 
-readlogs();
+=item S/O
 
-my $hdr_all = $num_spam + $num_ham;
-my $hdr_spam = $num_spam;
-my $hdr_ham = $num_ham;
+Shown only with B<-x> or B<-p>, this is the number of spam hits
+divided by total number of hits (C<S/O> refers to spam divided by
+overall).
 
-if ($opt_p) {
-  my $sorting = $opt_i ? "IG" : "RANK";
-  if ($opt_f) {
-    printf "%7s %7s %7s  %6s  %6s  %6s  %s\n",
-  	"OVERALL%", "FNEG%", "FPOS%", "S/O", $sorting, "SCORE", "NAME";
-  } else {
-    printf "%7s %7s  %7s  %6s  %6s  %6s  %s\n",
-  	"OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME";
-  }
-  printf "%7d  %7d  %7d  %7.3f %6.2f  %6.2f  (all messages)\n",
-  	$hdr_all, $hdr_spam, $hdr_ham,
-        soratio ($num_spam,$num_ham), 0, 0;
+=item RANK
 
-  $hdr_spam = ($num_spam / $hdr_all) * 100.0;
-  $hdr_ham = ($num_ham / $hdr_all) * 100.0;
-  $hdr_all = 100.0;             # this is obvious
-  printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  (all messages as %%)\n",
-  	$hdr_all, $hdr_spam, $hdr_ham,
-        soratio ($num_spam,$num_ham), 0, 0;
+Shown only with B<-x> or B<-p>, this is a measure that attempts to
+indicate how I<good> or I<useful> a test is. The higher it is, the
+better the test.
 
-} elsif ($opt_x) {
-  printf "%7s %7s  %7s  %6s  %6s %6s  %s\n",
-  	"OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME";
-  printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  (all messages)\n",
-  	$hdr_all, $hdr_spam, $hdr_ham,
-        soratio ($num_spam,$num_ham), 0, 0;
+=item SCORE
 
-} else {
-  printf "%10s  %10s  %10s  %s\n",
-  	"OVERALL", "SPAM", "HAM", "NAME";
-  printf "%10d  %10d  %10d  (all messages)\n",
-  	$hdr_all, $hdr_spam, $hdr_ham;
-}
+Shown only with B<-x> or B<-p>, this is the current score assigned to
+the rule.
 
-my %done = ();
-my @tests = ();
-my $rank_hi = 0;
-my $rank_lo = 9999999;
+=item NAME
 
-# variables for wanted/unwanted RANK
-my %wanted;
-my %unwanted;
-my %wranks;
-my %uranks;
+This is the rule's name.
 
-foreach my $test (keys %freq_spam, keys %freq_ham) {
-  next unless (exists $rules{$test});           # only valid tests
-  next if (!$opt_a && $rules{$test}->{issubrule});
+=back
 
-  next if $done{$test}; $done{$test} = 1;
-  push (@tests, $test);
+=head1 BUGS
 
-  my $isnice = 0;
-  if ($rules{$test}->{tflags} =~ /nice/) { $isnice = 1; }
+Please report bugs to http://bugzilla.spamassassin.org/
 
-  my $fs = $freq_spam{$test}; $fs ||= 0;
-  my $fn = $freq_ham{$test}; $fn ||= 0;
-  my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0;
-  my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0;
+=head1 SEE ALSO
 
-  my $soratio = $soratio{$test} = soratio ($fsadj, $fnadj);
+L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>
 
-  if ($isnice) {
-    $soratio = 1.0 - $soratio;
-    my $tmp = $fsadj; $fsadj = $fnadj; $fnadj = $tmp;
-  }
+=cut
 
-  if ($opt_i) {
-    # come up with a ranking
-    my $rank;
-
-    # New new system: from "Learning to Filter Unsolicited Commercial E-Mail",
-    # Ion Androutsopoulos et al: determine the information gain IG(X, C) of the
-    # Boolean attributes (ie. the rules). Measures "the average reduction in
-    # the entropy of C (classification) given the value of X (the rule)". Makes
-    # a good ranking measure with a proper statistical basis. ;)
-    #
-    # Still would like to get an entropy measure in, too.
-    #
-    #             sum                                    P(X = x ^ C = c)
-    # IG(X,C) = x in [0, 1]    P(X = x ^ C = c) . log2( ------------------- )
-    #           c in [Ch, Cs]                           P(X = x) . P(C = c)
-    #
-    my $safe_nspam = $num_spam || 0.0000001;
-    my $safe_nham = $num_ham || 0.0000001;
-
-    my $num_all = ($num_spam + $num_ham);
-    my $safe_all = $num_all || 0.0000001;
-    my $f_all = $fs+$fn;
-
-    my $px0 = (($num_all - $f_all) / $safe_all);         # P(X = 0)
-    my $px1 = ($f_all / $safe_all);                      # P(X = 1)
-    my $pccs = ($num_spam / $safe_all);                  # P(C = Cs)
-    my $pcch = ($num_ham / $safe_all);                   # P(C = Ch)
-    my $px1ccs = ($fs / $safe_nspam);                   # P(X = 1 ^ C = Cs)
-    my $px1cch = ($fn / $safe_nham);                    # P(X = 1 ^ C = Ch)
-    my $px0ccs = (($num_spam - $fs) / $safe_nspam);     # P(X = 0 ^ C = Cs)
-    my $px0cch = (($num_ham - $fn) / $safe_nham);       # P(X = 0 ^ C = Ch)
-    my $safe_px0_dot_pccs = ($px0 * $pccs) || 0.00000001;
-    my $safe_px0_dot_pcch = ($px0 * $pcch) || 0.00000001;
-    my $safe_px1_dot_pccs = ($px1 * $pccs) || 0.00000001;
-    my $safe_px1_dot_pcch = ($px1 * $pcch) || 0.00000001;
-
-    sub log2 { return log($_[0]) / 0.693147180559945; } # log(2) = 0.6931...
-
-    my $safe_px0ccs = ($px0ccs || 0.0000001);
-    my $safe_px0cch = ($px0cch || 0.0000001);
-    my $safe_px1ccs = ($px1ccs || 0.0000001);
-    my $safe_px1cch = ($px1cch || 0.0000001);
-    $rank = ( $px0ccs * log2($safe_px0ccs / $safe_px0_dot_pccs) ) +
-                    ( $px0cch * log2($safe_px0cch / $safe_px0_dot_pcch) ) +
-                    ( $px1ccs * log2($safe_px1ccs / $safe_px1_dot_pccs) ) +
-                    ( $px1cch * log2($safe_px1cch / $safe_px1_dot_pcch) );
-
-    $ranking{$test} = $rank;
-    $rank_hi = $rank if ($rank > $rank_hi);
-    $rank_lo = $rank if ($rank < $rank_lo);
-  }
-  else {
-    # basic wanted/unwanted ranking
-    $wanted{$test} = $isnice ? $fn : $fs;
-    $unwanted{$test} = $isnice ? $fs : $fn;
-    # count number of ranks of each type
-    $wranks{$wanted{$test}} = 1;
-    $uranks{$unwanted{$test}} = 1;
-  }
+if ($opt_L && $opt_inclang) {
+  pod2usage("-L/--language and --include-language are mutually exclusive");
 }
 
-# finish basic wanted/unwanted ranking
-if (! $opt_i) {
-  my @wanted = sort { $wanted{$a} <=> $wanted{$b} } keys %wanted;
-  my @unwanted = sort { $unwanted{$b} <=> $unwanted{$a} } keys %unwanted;
-
-  # first half of ranking is the wanted rank
-  my $position = 0;
-  my $last = undef;
-  for my $test (@wanted) {
-    $position++ if defined $last && $last != $wanted{$test};
-    $ranking{$test} += $position;
-    $last = $wanted{$test}
-  }
-
-  # second half of ranking is the unwanted rank
-  my $normalize = (scalar keys %wranks) / (scalar keys %uranks);
-  $position = 0;
-  $last = undef;
-  for my $test (@unwanted) {
-    $position++ if defined $last && $last != $unwanted{$test};
-    $ranking{$test} += ($position * $normalize);
-    $last = $unwanted{$test};
-    $rank_hi = $ranking{$test} if ($ranking{$test} > $rank_hi);
-    $rank_lo = $ranking{$test} if ($ranking{$test} < $rank_lo);
-  }
+if ($opt_p) {
+  $opt_x = 1;
 }
 
-{
-  # now normalise the rankings to [0, 1]
-  $rank_hi -= $rank_lo;
-  foreach $test (@tests) {
-    $ranking{$test} = $rank_hi == 0 ? 0.001 : ($ranking{$test} - $rank_lo) / ($rank_hi);
-  }
-}
+$opt_s = 0 if ( !defined $opt_s );
 
-foreach $test (sort { $ranking{$b} <=> $ranking{$a} } @tests) {
-  next unless (exists $rules{$test});           # only valid tests
-  next if (!$opt_a && $rules{$test}->{issubrule});
+my $ok_lang = lc ( $opt_inclang || $opt_L || '');
+$ok_lang = '.' if ($ok_lang eq 'all');
 
-  my $fs = $freq_spam{$test}; $fs ||= 0;
-  my $fn = $freq_ham{$test}; $fn ||= 0;
-  my $fa = $fs+$fn;
+my $greprules = sub { # To determine whether rule should be read
+  my ($name, $rule) = @_;
 
-  next if ($opt_m && $test !~ m/$opt_m/);	# match certain tests
-  next if ($opt_t && $rules{$test}->{tflags} !~ /$opt_t/); # match tflags
+  return 0 if ($opt_m && $name !~ /$opt_m/); # name doesn't match -m
+                                             # expression
+  return 0 if ($opt_t && $rule->{tflags} !~ /$opt_t/); # tflags don't
+                                                       # match -t
+                                                       # expression
+  return 0 if (($opt_L && !$rule->{lang}) ||
+	   ($rule->{lang} &&
+	    (!$ok_lang || $rule->{lang} !~ /^$ok_lang/i))); # Wrong language
 
+  return 0 if ($rule->{issubrule} && !$opt_a);
+
   if (!$opt_a && !$opt_t) {
-    next if ($rules{$test}->{tflags} =~ /net/ && ($opt_s % 2 == 0));   # not net tests
-    next if ($rules{$test}->{tflags} =~ /userconf/); # or userconf
+    return 0 if ($rule->{tflags} =~ /net/ && ($opt_s % 2 == 0));
+    return 0 if ($rule->{tflags} =~ /userconf/); # or userconf
   }
+  return 1;
 
-  # adjust based on corpora sizes (and cvt to % while we're at it)
-  my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0;
-  my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0;
+};
 
-  if ($opt_f && $fsadj == 0 && $fnadj == 0) { next; }
 
-  if ($opt_p) {
-    $fa = ($fa / ($num_spam + $num_ham)) * 100.0;
-    $fs = $fsadj;
-    $fn = $fnadj;
-  }
+my $logfile = $opt_l || "masses.log";
 
-  my $soratio = $soratio{$test};
-  if (!defined $soratio) {
-    $soratio{$test} = soratio ($fsadj, $fnadj);
-  }
+if (!$opt_c || !scalar(@$opt_c)) {
+    # Try to read this in from the log, if possible
+    open IN, $logfile or die "Can't open $logfile: $!";
+    my $files = 0; # are we in the files section?
+    while(<IN>) {
+	if (!$files) {
+	    if (/^\# SVN revision:/) {
+		$opt_c = [ "$FindBin::Bin/../rules" ];
+		last;
+	    } elsif (/^\# Using configuration:$/) {
+		$files = 1;
+	    }
+	} elsif (/^\#\s+(.*)\s*$/) {
+	    push (@$opt_c, $1);
+	} else {
+	    # All done!
+	    last;
+	}
+    }
 
-  if ($opt_p) {
-    printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  %s\n",
-  	$fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}, $test;
+    if (!defined $opt_c) {
+      $opt_c = [ "$FindBin::Bin/../rules" ];
+    }
 
-  } elsif ($opt_x) {
-    printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  %s\n",
-  	$fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}, $test;
-
-  } else {
-    printf "%10d  %10d  %10d  %s\n", $fa, $fs, $fn, $test;
-  }
+    foreach my $file (@$opt_c) {
+	die "Can't read $file" unless -r $file;
+    }
 }
-exit;
+	    
+my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
+					       scoreset => $opt_s,
+                                               falsesonly => $opt_f,
+                                               greprules => $greprules,
+                                               logfile => $logfile,
+                                               nologs => 1});
 
+$masses->readrules();
+$masses->readlogs();
+$masses->do_statistics();
+$masses->do_rank();
 
+my $rules = $masses->get_rules_hash();
+my $num_ham = $masses->get_num_ham();
+my $num_spam = $masses->get_num_spam();
+my $num_all = $num_ham + $num_spam;
 
-sub readlogs {
-  my $spam = $ARGV[0] || "spam.log";
-  my $ham = $ARGV[1] || (-f "good.log" ? "good.log" : "ham.log");
+if ($num_ham + $num_spam <= 0) {
+  die "Can't run hit-frequencies on 0 messages.";
+}
 
-  foreach my $file ($spam, $ham) {
-    open (IN, "<$file") || die "Could not open file '$file': $!";
+## Write header
 
-    my $isspam = 0; ($file eq $spam) and $isspam = 1;
+if ($opt_p) {
 
-    while (<IN>) {
-      next if (/^#/);
-      next unless (!$opt_M || /$opt_M/o);
-      next if ($opt_X && /$opt_X/o);
+  if ($opt_f) {
+    printf "%7s %7s %7s  %6s  %6s  %6s  %s\n",
+  	"OVERALL%", "FNEG%", "FPOS%", "S/O", "RANK", "SCORE", "NAME";
+  } else {
+    printf "%7s %7s  %7s  %6s  %6s  %6s  %s\n",
+  	"OVERALL%", "SPAM%", "HAM%", "S/O", "RANK", "SCORE", "NAME";
+  }
 
-      /^(.)\s+(-?\d+)\s+(\S+)\s*(\S*)/ or next;
-      my $caught = ($1 eq 'Y');
-      my $hits = $2;
-      $_ = $4; s/,,+/,/g;
+  printf "%7d  %7d  %7d  %7.3f %6.2f  %6.2f  (all messages)\n",
+  	$num_all, $num_spam, $num_ham,
+        $num_spam / $num_all, 0, 0;
 
-      if ($isspam) {
-        if ($opt_f) {
-          if (!$caught) { $num_spam++; }
-        } else {
-          $num_spam++;
-        }
-      } else {
-        if ($opt_f) {
-          if ($caught) { $num_ham++; }
-        } else {
-          $num_ham++;
-        }
-      }
+  printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  (all messages as %%)\n",
+  	100.0, $num_spam / $num_all * 100.0, $num_ham / $num_all * 100.0,
+        $num_spam / $num_all, 0, 0;
 
-      my @tests = split (/,/, $_);
-      foreach my $t (@tests) {
-	next if ($t eq '');
-	if ($isspam) {
-          if ($opt_f) {
-            if (!$caught) { $freq_spam{$t}++; }
-          } else {
-            $freq_spam{$t}++;
-          }
-	} else {
-          if ($opt_f) {
-            if ($caught) { $freq_ham{$t}++; }
-          } else {
-            $freq_ham{$t}++;
-          }
-	}
-      }
-    }
-    close IN;
-  }
-}
+} elsif ($opt_x) {
+  printf "%7s %7s  %7s  %6s  %6s %6s  %s\n",
+  	"OVERALL", "SPAM", "HAM", "S/O", "RANK", "SCORE", "NAME";
+  printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  (all messages)\n",
+  	$num_all, $num_spam, $num_ham,
+        $num_spam / $num_all, 0, 0;
 
-
-sub readscores {
-  my($cffile) = @_;
-  system ("$FindBin::Bin/parse-rules-for-masses -d \"$cffile\" -s $opt_s") and die;
-  require "./tmp/rules.pl";
+} else {
+  printf "%10s  %10s  %10s  %s\n",
+  	"OVERALL", "SPAM", "HAM", "NAME";
+  printf "%10d  %10d  %10d  (all messages)\n",
+  	$num_all, $num_spam, $num_ham;
 }
 
-sub soratio {
-  my ($s, $n) = @_;
+foreach my $test (sort { $rules->{$b}->{rank} <=> $rules->{$a}->{rank} } keys %{$rules}) {
 
-  $s ||= 0;
-  $n ||= 0;
-
-  if ($s + $n > 0) {
-      return $s / ($s + $n);
+  if ($opt_p) {
+    printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  %s\n",
+  	($rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham}) / $num_all * 100.0,
+        $rules->{$test}->{spam_percent}, $rules->{$test}->{ham_percent},
+        $rules->{$test}->{soratio}, $rules->{$test}->{rank}, $rules->{$test}->{score}, $test;
+  } elsif ($opt_x) {
+    printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  %s\n",
+  	$rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham},
+        $rules->{$test}->{freq_spam}, $rules->{$test}->{freq_ham},
+        $rules->{$test}->{soratio}, $rules->{$test}->{rank}, $rules->{$test}->{score}, $test;
   } else {
-      return 0.5;		# no results -> not effective
+    printf "%10d  %10d  %10d  %s\n",
+        $rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham},
+        $rules->{$test}->{freq_spam}, $rules->{$test}->{freq_ham}, $test;
   }
 }
 
-sub tcr {
-  my ($nspam, $nlegit, $nspamspam, $nlegitspam) = @_;
-  my $nspamlegit = $nspam - $nspamspam;
-  my $nlegitlegit = $nlegit - $nlegitspam;
-
-  my $lambda = 99;
-
-  my $werr = ($lambda * $nlegitspam + $nspamlegit)
-                  / ($lambda * $nlegit + $nspam);
-
-  my $werr_base = $nspam
-                  / ($lambda * $nlegit + $nspam);
-
-  $werr ||= 0.000001;     # avoid / by 0
-  my $tcr = $werr_base / $werr;
-  return $tcr;
-}
Index: masses/perceptron.pod
===================================================================
--- masses/perceptron.pod	(revision 0)
+++ masses/perceptron.pod	(revision 0)
@@ -0,0 +1,30 @@
+=head1 NAME
+
+perceptron - Generate scores for SpamAssassin using the "Stochastic
+Gradient Descent" method
+
+=head1 SYNOPSIS
+
+perceptron [options]
+
+ Options:
+  -p ham_preference 	Modifies tendency to prefer false negatives over
+			false positives (default 2.0) (higher = less fp)
+  -e num_epochs		Set number of passes to make (default 15)
+  -l learning_rate	Modifies learning rate (default 2.0)
+  -w weight_decay 	Scores multiplied by this value after each pass
+			to prevent scores from getting too high
+			(default off (1.0))
+
+=head1 DESCRIPTION
+
+This algorithm is used to optimize SpamAssassin scores, based on the
+input given by B<logs-to-c>. At the time of writing, the output of
+logs-to-c needs to be compiled into the source before perceptron can
+be used, but this will be fixed soon, I hope.
+
+=head1 SEE ALSO
+
+L<logs-to-c(1)>
+
+=cut
Index: masses/rewrite-cf-with-new-scores
===================================================================
--- masses/rewrite-cf-with-new-scores	(revision 20231)
+++ masses/rewrite-cf-with-new-scores	(working copy)
@@ -16,32 +16,123 @@
 # limitations under the License.
 # </@LICENSE>
 
+=head1 NAME
+
+rewrite-cf-with-new-scores - Rewrite SpamAssassin scores file with new
+scores.
+
+=head1 SYNOPSIS
+
+rewrite-cf-with-new-scores [options]
+
+  Options
+  --old-scores=file    Read file containing the old SpamAssassin scores
+  --new-scores=file    Read file containing the new SpamAssassin scores
+  -s,--scoreset n      Rewrite scoreset n
+  --output=file        Output rewritten score file to file
+  -c,--cffile=path     Use path as the rules directory
+  -l,--logfile=file    Use file instead of masses.log (for guessing -c)
+
+ Note: these options can be shortened (e.g. --old, --new, --out) as
+ long as they are unambiguous.
+
+=head1 DESCRIPTION
+
+B<rewrite-cf-with-new-scores> is a tool to update the sitewide scores
+file with the newly generated scores. Since SpamAssassin has four
+different scoresets, which each need to be generated separately, this
+tool is used to only change the correct scoreset.
+
+By default, the old scores are read from 50_scores.cf in the rules
+directory and the new ones from ./perceptron.scores. The output will
+be ./50_scores.cf by default.
+
+The rules directory needs to be used to make sure scores are given for
+the right tests. Rules not found in the rules directory will not be
+given scores in the output.
+
+=head1 BUGS
+
+Please report bugs to http://bugzilla.spamassassin.org/
+
+=head1 SEE ALSO
+
+L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>
+
+=cut
+
+use FindBin;
+use lib "$FindBin::Bin/../lib";
+use Getopt::Long qw(:config bundling auto_help);
+use Mail::SpamAssassin::Masses;
+use Pod::Usage;
+use strict;
+use warnings;
+
+use vars qw($opt_old $opt_new $opt_scoreset $opt_out $opt_c $opt_l);
+
+GetOptions("old-scores=s" => \$opt_old,
+	   "new-scores=s" => \$opt_new,
+	   "s|scoreset=i" => \$opt_scoreset,
+	   "output=s" => \$opt_out,
+	   "c|cffile=s@" => \$opt_c,
+	   "l|logfile=s" => \$opt_l);
+
+$opt_l ||= "masses.log";
+$opt_scoreset = 0 unless defined $opt_scoreset;
+
 my $NUM_SCORESETS = 4;
 
-my ($scoreset,$oldscores,$newscores) = @ARGV;
+if (!$opt_c || !scalar(@$opt_c)) {
+    # Try to read this in from the log, if possible
+    open IN, $opt_l or die "Can't open $opt_l: $!";
+    my $files = 0; # are we in the files section?
+    while(<IN>) {
+	if (!$files) {
+	    if (/^\# SVN revision:/) {
+		$opt_c = [ "$FindBin::Bin/../rules" ];
+		last;
+	    } elsif (/^\# Using configuration:$/) {
+		$files = 1;
+	    }
+	} elsif (/^\#\s+(.*)\s*$/) {
+	    push (@$opt_c, $1);
+	} else {
+	    # All done!
+	    last;
+	}
+    }
 
-$scoreset = int($scoreset) if defined $scoreset;
-if (!defined $newscores || $scoreset < 0 || $scoreset >= $NUM_SCORESETS ) {
-  die "usage: rewrite-cf-with-new-scores scoreset oldscores.cf newscores.cf\n";
+    if (!defined $opt_c) {
+      $opt_c = [ "$FindBin::Bin/../rules" ];
+    }
+
+    foreach my $file (@$opt_c) {
+	die "Can't read $file" unless -r $file;
+    }
 }
 
-system ("./parse-rules-for-masses -s $scoreset") and die;
-if (-e "tmp/rules.pl") {
-  # Note, the spaces need to stay in front of the require to work around a RPM 4.1 problem
-  require "./tmp/rules.pl";
+if (!$opt_old) {
+  $opt_old = $$opt_c[0] . "/50_scores.cf";
 }
-else {
-  die "parse-rules-for-masses had no error but no tmp/rules.pl!?!";
-}
 
+$opt_new ||= "perceptron.scores";   # documented default for --new-scores
+$opt_out ||= "50_scores.cf";        # documented default for --output
+my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
+					       scoreset => $opt_scoreset});
+
+$masses->readrules();
+my $rules = $masses->get_rules_hash();
+
 # now read the generated scores
 my @gascoreorder = ();
+my %oldscores = ();
 my %gascorelines = ();
-open (STDIN, "<$newscores") or die "cannot open $newscores";
+open (STDIN, "<$opt_new") or die "cannot open $opt_new";
 while (<STDIN>) {
   /^score\s+(\S+)\s+(-?\d+(?:\.\d+)?)/ or next;
   my $name = $1;  my $score = $2;
-  next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0);
+  next unless (exists ($rules->{$name}) && !$rules->{$name}->{issubrule});
   next if ($name =~ /^__/);
   next if ($name eq '(null)');	# er, oops ;)
 
@@ -49,7 +140,7 @@
   push (@gascoreorder, $name);
 }
 
-open (IN, "<$oldscores") or die "cannot open $oldscores";
+open (IN, "<$opt_old") or die "cannot open $opt_old";
 my $out = '';
 my $pre = '';
 
@@ -58,7 +149,7 @@
 while (<IN>) {
   if (/^\s*score\s+(\S+)\s/) {
     delete $gascorelines{$1};
-    next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0);
+    next unless (exists ($rules->{$1}) && !$rules->{$1}->{issubrule});
   }
   $pre .= $_;
   /^# Start of generated scores/ and last;
@@ -82,10 +173,10 @@
   if (/^\s*score\s+\S+/) {
     my($score,$name,@scores) = split;
 
-    next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0);
+    next unless (exists ($rules->{$name}) && !$rules->{$name}->{issubrule});
     if (defined $gascorelines{$name}) {
       # Set appropriate scoreset value
-      $scores[$scoreset] = $gascorelines{$name};
+      $scores[$opt_scoreset] = $gascorelines{$name};
 
       # Create new score line
       $_ = join(" ","score",$name,generate_scores(@scores))."\n";
@@ -96,8 +187,10 @@
 }
 close IN;
 
+open OUT, ">$opt_out" or die "Can't open $opt_out: $!";
+
 # and output the lot
-print $pre, "\n";
+print OUT $pre, "\n";
 foreach my $name (@gascoreorder) {
   $_ = $gascorelines{$name};
   next unless (defined ($_));
@@ -107,12 +200,12 @@
   @scores = @{$oldscores{$name}} if ( exists $oldscores{$name} );
 
   # Set appropriate scoreset value
-  $scores[$scoreset] = $_;
+  $scores[$opt_scoreset] = $_;
 
   # Create new score line
-  print join(" ","score",$name,generate_scores(@scores)),"\n";
+  print OUT join(" ","score",$name,generate_scores(@scores)),"\n";
 }
-print "\n", $out, "\n";
+print OUT "\n", $out, "\n";
 
 sub generate_scores {
   my (@scores) = @_;
Index: masses/mboxget
===================================================================
--- masses/mboxget	(revision 20231)
+++ masses/mboxget	(working copy)
@@ -1,45 +0,0 @@
-#!/usr/bin/perl -w
-
-# mboxget - get a message from a mailbox
-#
-# usage: mboxget [mass-check-mbox-id ...]
-#
-# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# </@LICENSE>
-
-use strict;
-
-my $prog = $0;
-$prog =~ s@.*/@@;
-
-foreach my $where (@ARGV) {
-  my ($file, $offset) = ($where =~ m/(.*?)(?:\.(\d+))?$/);
-  open(INPUT, $file) || die("$prog: open $file failed: $!\n");
-  if ($offset) {
-    seek(INPUT, $offset, 0) || die("$prog: seek $offset failed: $!\n");
-  }
-  my $past = 0;
-  while (<INPUT>) {
-    if ($past) {
-      last if substr($_,0,5) eq "From ";
-    }
-    else {
-      $past = 1;
-    }
-    print $_;
-  }
-  close INPUT;
-}
Index: masses/rule-qa/corpus-nightly
===================================================================
--- masses/rule-qa/corpus-nightly	(revision 20231)
+++ masses/rule-qa/corpus-nightly	(working copy)
@@ -81,14 +81,13 @@
 date > test.end
 
 # results name
-mv spam.log spam-$net$username.log
-mv ham.log ham-$net$username.log
+mv masses.log masses-$net$username.log
 
 # rsync
 set +e
 retry=0
 while true; do
-	if rsync -CPcvuzb --timeout=120 spam-$net$username.log ham-$net$username.log $username@rsync.spamassassin.org::corpus/; then
+	if rsync -CPcvuzb --timeout=120 masses-$net$username.log $username@rsync.spamassassin.org::corpus/; then
 		break;
 	fi
 	if [ $retry -eq 120 ]; then
@@ -99,3 +98,4 @@
 	sleep 30
 done
 set -e
+
Index: masses/rule-qa/corpus-hourly
===================================================================
--- masses/rule-qa/corpus-hourly	(revision 20231)
+++ masses/rule-qa/corpus-hourly	(working copy)
@@ -92,7 +92,7 @@
     @files = sort readdir(CORPUS);
     closedir(CORPUS);
 
-    @files = grep { /^(?:spam|ham)-(?:net-)?\w+\.log$/ && -f "$opt{corpus}/$_" && -M _ < 10 } @files;
+    @files = grep { /^masses-(?:net-)?\w+\.log$/ && -f "$opt{corpus}/$_" && -M _ < 10 } @files;
     @files = grep {
 	my $time = 0;
 	my $tag = 0;
@@ -109,6 +109,7 @@
 	}
 	$time;
     } @files;
+
 }
 
 sub rename {
@@ -158,131 +159,96 @@
 
 	    next if ($class eq "NET" && $age !~ /^(?:new|all|age|7day)$/);
 
-	    my @ham = grep { /^ham/ } @files;
-	    my @spam = grep { /^spam/ } @files;
+	    print STDERR "logs: " . join(' ', @files) . "\n";
 
-	    print STDERR "ham: " . join(' ', @ham) . "\n";
-	    print STDERR "spam: " . join(' ', @spam) . "\n";
-
 	    chdir $opt{corpus};
 
 	    # net vs. local
 	    if ($class eq "NET") {
-		@ham = grep { /-net-/ } @ham;
-		@spam = grep { /-net-/ } @spam;
-		print STDERR "ham: " . join(' ', @ham) . "\n";
-		print STDERR "spam: " . join(' ', @spam) . "\n";
+		@files = grep { /-net-/ } @files;
+		print STDERR "logs: " . join(' ', @files) . "\n";
 	    }
 	    else {
 		# if both net and local exist, use newer
-		my %spam;
-		my %ham;
-		
+		my %logs;	# per-user count of log files (net + local)
-		for my $file (@spam) {
-		    $spam{$1}++ if ($file =~ m/-(\w+)\.log$/);
+		for my $file (@files) {
+		    $logs{$1}++ if ($file =~ m/-(\w+)\.log$/);
 		}
-		for my $file (@ham) {
-		    $ham{$1}++ if ($file =~ m/-(\w+)\.log$/);
-		}
-		while (my ($user, $count) = each %ham) {
+		while (my ($user, $count) = each %logs) {
 		    if ($count > 1) {
-			my $nightly = "ham-$user.log";
-			my $weekly = "ham-net-$user.log";
+			my $nightly = "masses-$user.log";
+			my $weekly = "masses-net-$user.log";
 			if ($revision{$nightly} >= $revision{$weekly}) {
-			    @ham = grep { $_ ne $weekly } @ham;
+			    @files = grep { $_ ne $weekly } @files;
 			}
 			else {
-			    @ham = grep { $_ ne $nightly } @ham;
+			    @files = grep { $_ ne $nightly } @files;
 			}
 		    }
 		}
-		while (my ($user, $count) = each %spam) {
-		    if ($count > 1) {
-			my $nightly = "spam-$user.log";
-			my $weekly = "spam-net-$user.log";
-			if ($revision{$nightly} >= $revision{$weekly}) {
-			    @spam = grep { $_ ne $weekly } @spam;
-			}
-			else {
-			    @spam = grep { $_ ne $nightly } @spam;
-			}
-		    }
-		}
-		print STDERR "ham: " . join(' ', @ham) . "\n";
-		print STDERR "spam: " . join(' ', @spam) . "\n";
+		print STDERR "logs: " . join(' ', @files) . "\n";
 	    }
 	    
 	    # age
 	    if ($class eq "NET" && $age ne "7day") {
-		@ham = grep { -M "$_" < 10 } @ham;
-		@spam = grep { -M "$_" < 10 } @spam;
+		@files = grep { -M "$_" < 10 } @files;
 		# find most recent CVS revision
 		my $wanted = 0.0;
 		for (@spam, @ham) {
 		    $wanted = $revision{$_} if ($revision{$_} > $wanted);
 		}
-		@spam = grep { $revision{$_} eq $wanted } @spam;
-		@ham = grep { $revision{$_} eq $wanted } @ham;
-		print STDERR "ham: " . join(' ', @ham) . "\n";
-		print STDERR "spam: " . join(' ', @spam) . "\n";
+		@files = grep { $revision{$_} eq $wanted } @files;
+
+		print STDERR "logs: " . join(' ', @files) . "\n";
 	    }
 	    elsif ($age =~ /^(?:new|all|age)$/) {
-		@ham = grep { -M "$_" < -M $opt{tagtime} } @ham;
-		@spam = grep { -M "$_" < -M $opt{tagtime} } @spam;
-		@ham = grep { $revision{$_} eq $revision } @ham;
-		@spam = grep { $revision{$_} eq $revision } @spam;
-		print STDERR "ham: " . join(' ', @ham) . "\n";
-		print STDERR "spam: " . join(' ', @spam) . "\n";
+		@files = grep { -M "$_" < -M $opt{tagtime} } @files;
+
+		@files = grep { $revision{$_} eq $revision } @files;
+
+		print STDERR "logs: " . join(' ', @files) . "\n";
 	    }
 	    elsif ($age =~ /(\d+)day/) {
 		my $mtime = $1;
-		@ham = grep { -M "$_" < $mtime } @ham;
-		@spam = grep { -M "$_" < $mtime } @spam;
-		print STDERR "ham: " . join(' ', @ham) . "\n";
-		print STDERR "spam: " . join(' ', @spam) . "\n";
+		@files = grep { -M "$_" < $mtime } @files;
+
+		print STDERR "logs: " . join(' ', @files) . "\n";
 	    }
 	    
 	    open(OUT, "> $opt{html}/$class.$age");
-	    print OUT "# ham results used: " . join(" ", @ham) . "\n";
-	    print OUT "# spam results used: " . join(" ", @spam) . "\n";
-	    for (@ham) {
+	    print OUT "# results used: " . join(" ", @files) . "\n";
+
+	    for (@files) {
 		print OUT "# $_=$revision{$_}\n" if $revision{$_} ne $revision;
 	    }
-	    for (@spam) {
-		print OUT "# $_=$revision{$_}\n" if $revision{$_} ne $revision;
-	    }
 
 	    my $flags = "";
 	    $flags = "-t net -s 1" if $class eq "NET";
 	    $flags = "-M HTML_MESSAGE" if $class eq "HTML";
 
 	    if ($age eq "all") {
-		my %spam;
-		my %ham;
+		my %logs;
 		my @output;
 		
-		for my $file (@spam) {
-		    $spam{$1} = $file if ($file =~ m/-(\w+)\.log$/);
+		for my $file (@files) {
+		    $logs{$1} = $file if ($file =~ m/-(\w+)\.log$/);
 		}
-		for my $file (@ham) {
-		    $ham{$1} = $file if ($file =~ m/-(\w+)\.log$/);
-		}
-		unlink "$opt{tmp}/ham.log.$$";
-		unlink "$opt{tmp}/spam.log.$$";
-		next unless (scalar keys %spam && scalar keys %ham);
-		for my $user (sort keys %spam) {
-		    next unless defined $ham{$user};
+
+		unlink "$opt{tmp}/masses.log.$$";
+
+		next unless (scalar keys %logs);
+		for my $user (sort keys %logs) {
+
 		    chdir "$opt{tree}/masses";
-		    system("cat $opt{corpus}/$ham{$user} >> $opt{tmp}/ham.log.$$");
-		    system("cat $opt{corpus}/$spam{$user} >> $opt{tmp}/spam.log.$$");
-		    open(IN, "./hit-frequencies -xpa $flags $opt{corpus}/$spam{$user} $opt{corpus}/$ham{$user} |");
+		    system("cat $opt{corpus}/$logs{$user} >> $opt{tmp}/masses.log.$$");
+		    open(IN, "./hit-frequencies -xpa $flags -l $opt{corpus}/$logs{$user} |");
 		    while(<IN>) {
 			chomp;
 			push @output, "$_:$user\n";
 		    }
 		    close(IN);
 		}
-		open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
+		open(IN, "./hit-frequencies -xpa $flags -l $opt{tmp}/masses.log.$$ |");
 		while(<IN>) {
 		    push @output, $_;
 		}
@@ -298,21 +264,20 @@
 		    my ($after, $before) = split(/-/, $which);
 		    # get and filter logs
 		    chdir $opt{corpus};
-		    for my $type (("ham", "spam")) {
-			open(TMP, "> $opt{tmp}/$type.log.$$");
-			my @array = ($type eq "ham") ? @ham : @spam;
-			for my $file (@array) {
-			    open(IN, $file);
-			    while (<IN>) {
-				print TMP $_ if time_filter($after, $before);
-			    }
-			    close(IN);
-			}
-			close (TMP);
+
+		    open(TMP, "> $opt{tmp}/masses.log.$$");
+		    for my $file (@files) {
+		      open(IN, $file);
+		      while (<IN>) {
+			print TMP $_ if time_filter($after, $before);
+		      }
+		      close(IN);
 		    }
+		    close (TMP);
+
 		    # print out by age
 		    chdir "$opt{tree}/masses";
-		    open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
+		    open(IN, "./hit-frequencies -xpa $flags -l $opt{tmp}/masses.log.$$ |");
 		    while(<IN>) {
 			chomp;
 			push @output, "$_:$which\n";
@@ -323,13 +288,11 @@
 		    print OUT $_;
 		}
 	    }
-	    elsif (@ham && @spam) {
+	    elsif (@files) {
 		# get logs
-		system("cat " . join(" ", @ham) . " > $opt{tmp}/ham.log.$$");
-		system("cat " . join(" ", @spam) . " > $opt{tmp}/spam.log.$$");
-	
+		system("cat " . join(" ", @files) . " > $opt{tmp}/masses.log.$$");
 		chdir "$opt{tree}/masses";
-		open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
+		open(IN, "./hit-frequencies -xpa $flags -l $opt{tmp}/masses.log.$$ |");
 		while(<IN>) {
 		    print(OUT);
 		}
Index: masses/README.user
===================================================================
--- masses/README.user	(revision 0)
+++ masses/README.user	(revision 0)
@@ -0,0 +1,375 @@
+
+HOW TO GENERATE YOUR OWN SCORES FOR SPAMASSASSIN
+------------------------------------------------
+
+Duncan Findlay
+<duncf@debian.org>
+
+
+1. Introduction
+
+One of the reasons SpamAssassin is so accurate is that its scores are
+carefully optimized based on collections (aka. corpus, plural:
+corpora) of mail from volunteers all across the world. Each volunteer
+uses a script ("mass-check") to run SpamAssassin over each piece of
+mail in their corpus. They then submit the results to a central server
+where the SpamAssassin development team runs the scoring mechanism to
+generate optimal scores.
+
+SpamAssassin uses four different scoresets depending on the options
+used. These are almost always referred to by number, as shown below:
+
+Scoreset
+   0	 - Network tests disabled, Bayes disabled
+   1     - Network tests enabled, Bayes disabled
+   2     - Network tests disabled, Bayes enabled
+   3     - Network tests enabled, Bayes enabled
+
+Things are further complicated by the fact that when Bayes is enabled,
+it automatically learns using the equivalent scoreset with Bayes
+disabled. As a result, optimal scores for scoresets 2 and 3 can only
+be generated after scoresets 0 and 1. Set 0 logs can be generated from
+set 1 logs, but sets 2 and 3 need to be done separately.
+
+As a result, volunteers who take part in our rescoring survey need to
+run 3 mass-checks, each of which can take many hours. Since the
+generation of scores is such a laborious process, the SpamAssassin
+developers only perform this once per release.
+
+Luckily, the previous score optimizer, a Genetic Algorithm, which took
+almost 24 hours to optimize scores for one scoreset has been replaced
+with the Perceptron (thanks to Henry Stern) which uses a "Stochastic
+Gradient Descent" method. Don't worry if you don't understand what
+this means, I certainly don't. The Perceptron takes less than 15
+seconds to generate scores of roughly equal quality as the GA.
+
+
+2. Compiling a Corpus
+
+The first step to generating your own scores is to start collecting
+mail, both ham (non-spam) and spam. These should be representative of
+all the mail you receive, but you should filter out spam related
+lists, like spamassassin-users to avoid skewing results. It is
+essential that these corpora be very well classified. It will greatly
+reduce the effectiveness of your scores if spam mails get misfiled
+into your ham folder and vice versa.
+
+Also, it is important to note that SpamAssassin is not designed to be
+a virus filter, so it's best if you filter out viruses from your ham
+and spam folders too.
+
+Furthermore, since spam and ham characteristics change over time, it's
+best to leave out mail over 6 months old. This is especially important for
+network tests, since these are designed to stop current spam, and are
+not historical records.
+
+I'm not entirely sure how big corpora should be. The bigger, the
+better. If your corpus is too small, it may not be sufficiently
+representative of all the mail you receive, and accuracy will
+suffer. My corpus of mail for the last 6 months is over 55000 messages
+(35000 spam, 20000 ham).
+
+
+3. Mass-check
+
+Now that you've assembled your corpora, you need to use mass-check to
+test each message with SpamAssassin. This script is surprisingly fast,
+as it accesses the internal perl libraries of SpamAssassin, without
+the need to load a new perl process each time (as you would if you
+piped each message through spamassassin). Doing a scoreset 2 run (no
+network, bayes enabled) I get roughly 10,000 messages an hour on an
+unloaded Pentium 4, 2.80GHz computer with 512 MB RAM.
+
+By default, if you are not running out of an unpacked source tree,
+mass-check will read rules from the usual locations. As a result, you
+should make sure ~/.spamassassin/user_prefs contains no rules, unless
+you are planning on using your generated scores for only yourself, not
+sitewide.
+
+The first step is to define the locations of all of the messages in
+your corpora (these are known as "targets"). I find it's easiest to
+put this in a separate file with lines of the following format:
+
+class:format:location
+
+Class is either "spam" or "ham", format is "mbox", "file", "dir" or
+"mbx" and location is the path to the mailbox. mass-check supports
+using * as a wildcard, so the following target is permitted:
+
+spam:mbox:/home/duncf/Maildir/Old/spam/*
+
+Once you have placed all the "targets" necessary for your corpora, run
+mass-check with the following command.
+
+mass-check -f file
+
+If you are doing a mass-check run for scoreset 1 or 3 (i.e. network tests
+enabled) you will also need to add the --net option, and you will want
+to add -j8 (or some other number) to indicate how many messages to
+test in parallel. This is useful since a lot of time would otherwise
+be spent waiting for network queries to return.
+
+mass-check will generate a log file in the current directory entitled
+masses.log. This is the log file that will enable us to optimize
+scores.
+
+For the impatient: if you're one of those people who want to know
+exactly how far mass-check has gotten through your mail, use the
+--showdots option.
+
+
+4. Checking the quality of your corpora (a.k.a. Pulling Weeds)
+
+In order to ensure that your corpora don't contain misfiled mails, it
+is good to double check the highest scoring hams and lowest scoring
+spams.
+
+First check ham mail:
+
+grep "^h" masses.log | sort -rn -k2,2 | head -20
+
+If you want to read the corresponding messages try piping to
+extract-message-from-mbox -m (see the extract-message-from-mbox
+section for more detail).
+
+Do the same with spam mail:
+
+grep "^s" masses.log | sort -n -k2,2 | head -20
+
+
+5. extract-message-from-mbox
+
+extract-message-from-mbox takes a mbox filename and a byte offset and
+outputs the corresponding mail message. With the -m option, mass-check
+output (i.e. lines from masses.log) is read from the standard
+input. Without, arguments are expected to be in the form
+<mbox>.<offset> (i.e. /path/to/mbox.12345)
+
+The -h option can also be used to only show message headers.
+
+As shown above, it is quite useful to pipe portions of masses.log to
+extract-message-from-mbox.
+
+
+6. hit-frequencies
+
+hit-frequencies doesn't really help you advance toward your goal of
+optimizing scores, but it is very useful in evaluating locally created
+rules. Run it, look at its output; you'll find it interesting (and if
+not, feel free to skip to the next section).
+
+hit-frequencies -x -p -s <scoreset>
+
+hit-frequencies (and many other scripts) are set to automatically
+guess where to find your configuration files based on
+masses.log. Unfortunately, it isn't perfect (actually it's a rather
+crude hack, but that's irrelevant). You may have to check masses.log
+to figure out where it's searching and/or add --cffile options (you
+can specify multiple paths using multiple --cffile options).
+
+hit-frequencies -x -p generates the following output:
+
+OVERALL%   SPAM%     HAM%     S/O    RANK   SCORE  NAME
+  64008    40932    23076    0.639   0.00    0.00  (all messages)
+100.000  63.9483  36.0517    0.639   0.00    0.00  (all messages as %)
+ 10.382  16.2342   0.0000    1.000   1.00    3.10  FORGED_MUA_OUTLOOK
+  8.266  12.9263   0.0000    1.000   0.99    1.00  FORGED_OUTLOOK_TAGS
+  6.484  10.1388   0.0000    1.000   0.98    4.50  DRUGS_ERECTILE_OBFU
+[...]
+
+The first two rows show the size of the corpora and their ham/spam
+break down. The following lines list each rule found and give various
+statistics about it based on your masses.log.
+
+OVERALL% represents the percentage of total messages (spam and ham)
+that the rule hits, SPAM% and HAM% show the percentages on each
+corpus. S/O is SPAM% divided by (SPAM% + HAM%). Generally good
+(non-nice) rules have S/O's over 0.95, while nice (negative-scoring)
+rules generally have S/O's less than 0.5. RANK is a human readable
+indicator of how good a rule is. The higher the better, always. RANK
+is designed to be a rough indicator of the score the perceptron is
+likely to give it. SCORE is simply the current score. (This is simply
+listed for convenience, not calculated in any way.)
+
+If you do any rule development locally, you will find this is a great
+tool. If you come up with some great rules (that we haven't already
+thought of), please send us a patch at
+http://bugzilla.spamassassin.org/.
+
+
+7. lint-rules-from-freqs
+
+This script is designed to read in your masses.log and the
+SpamAssassin configuration files in order to find both bad syntax and
+bad rules that hit few messages or (with -f) have too many false
+positives/negatives, etc.
+
+lint-rules-from-freqs -f -s <scoreset>
+
+As with hit-frequencies, it tries to be smart with choosing the right
+--cffile options.
+
+This script is roughly the equivalent of running a spamassassin --lint
+and running a hit-frequencies to determine which tests have bad S/O
+ratios.
+
+
+8. logs-to-c
+
+logs-to-c is the program that converts a mass-check log into code that
+can be easily used by the perceptron. Currently, it is necessary to
+use the output of logs-to-c to even compile perceptron, but that
+should hopefully change in the near future.
+
+The files logs-to-c creates need to be in the tmp/ sub-directory of the
+directory where perceptron.c is.
+
+logs-to-c -o tmp/ -s <scoreset>
+
+These files contain information about each rule such as whether or not
+the perceptron is permitted to change the rule's score, the range
+within which the perceptron can adjust it, whether or not a rule is
+nice, etc. In addition, these files contain information about each
+mail hit and which tests were hit. The files generated by logs-to-c
+are not really easy to read, so don't try; use hit-frequencies
+instead.
+
+
+9. perceptron
+
+perceptron is the brains behind the whole process. (And we must of
+course thank the brain behind perceptron, Henry Stern, for his
+contribution.)
+
+While the perceptron takes options for things such as "ham
+preference", "number of epochs", "learning rate" and "weight decay",
+it's probably best to trust the defaults; unless of course you want to
+try to find the optimum parameters (and post them to
+http://bugzilla.spamassassin.org/ with your evidence).
+
+The perceptron is incredibly quick. So start it, wait 15 seconds and
+voila, your optimized scores are ready. The output is in
+perceptron.scores.
+
+Unfortunately, it needs to be built from source every time you want to
+use it with a different masses.log or set of rules. In the directory
+containing perceptron.c, try:
+
+make perceptron
+./perceptron
+
+If you don't have the Makefile, try
+gcc -g -O2 -Wall -o perceptron perceptron.c -lm
+./perceptron
+
+
+10. rewrite-cf-with-new-scores
+
+perceptron dumps its results in perceptron.scores. Great. How does
+that help you? rewrite-cf-with-new-scores takes care of changing the
+old configuration files to correspond with the new scores. The script
+takes into account rules found in your configuration, so make sure
+that the --cffile argument is right (it'll read this from masses.log
+by default). The syntax is:
+
+rewrite-cf-with-new-scores --old 50_scores.cf --new perceptron.scores \
+  --out 50_scores.new.cf -l masses.log -s 2
+
+Make sure you don't forget the -s option. You need to tell it which
+scoreset to update or it'll update set 0, which is not what you want
+(unless you just did a set 0 run, of course).
+
+Note: the statistics in the new scores file are NOT updated. Just the
+scores are.
+
+11. fp-fn-statistics
+
+This script calculates how good the scores are at a given threshold. It
+returns the number of false positives, false negatives, true
+positives, true negatives and a whole variety of fun statistics.
+
+./fp-fn-statistics -s <scoreset> --cffile <path>
+
+fp-fn-statistics also generates a TCR which is essentially an overall
+rating of how good the scores are. (This is only accurate when run on
+a different corpus of mail than that with which the scores were
+generated). TCR stands for "Total Cost Ratio". The higher the number,
+the better the set of scores.
+
+
+12. Submitting corpora for SpamAssassin
+
+If you want to contribute your mass-check logs to the SpamAssassin
+rescoring process, please download the latest revision of SpamAssassin
+from the subversion repository. See this page of the wiki:
+http://wiki.spamassassin.org/DownloadFromSvn
+
+You will want to read CORPUS_POLICY and CORPUS_SUBMIT. We only do
+large rescoring runs just before releases, so be sure to follow the
+lists which will have more information and reminders on how to
+participate.
+
+Please be sure your corpora are of high quality (everything must be
+carefully checked to avoid misfilings). Also, we appreciate varied
+sources of mail.
+
+
+13. Other scripts
+
+Only a subset of the scripts used in rule development and scoring have
+been documented here. Most of the others aren't really very
+useful. You can examine the others by downloading the source from the
+subversion repository: http://wiki.spamassassin.org/DownloadFromSvn.
+Everything relating to rule QA and development is in the masses/
+sub-directory.
+
+The scripts presented here have had man pages written for them, and an
+attempt has been made to standardize the options for ease of use. Many
+of the others may require some reading of source to understand how
+they work and what they do.
+
+
+14. Frequently Asked Questions
+
+(Since this is the first version of this document, I'm guessing what
+questions would otherwise be asked. So this isn't really a "Frequently
+Asked Questions" list, but a "What did Duncan fail to address
+elsewhere?" list.)
+
+Q. Why don't the scripts automatically guess which scoreset to use like
+they do with --cffile?
+
+A. Firstly, mass-check does not know what scoreset
+you are running. It could guess, but it probably shouldn't. Secondly,
+the same masses.log can be used for multiple scoresets (a set 1 log
+can be used to generate scores for sets 0 and 1, by stripping out net
+rules etc.)
+
+Q. How can I determine how good the scoring system is?
+
+A. There is a series of scripts in the source directory (in
+masses/tenpass/) designed to determine how accurate the perceptron is
+by using "10-fold Cross Validation" (10fcv). Basically, the masses.log
+is split into 10 "buckets" and each bucket is sequentially used to
+validate against scores generated from the remaining 9.
+
+
+15. Bugs, author, improvements, etc.
+
+SpamAssassin is written and maintained by a group of developers, whose
+names can be found in the CREDITS file.
+
+If you have further questions about SpamAssassin or the rescoring
+scripts, try the following:
+
+- Ask on one of the SpamAssassin mailing lists:
+
+http://www.spamassassin.org/lists.html
+
+- If you've found a bug, file a report:
+
+http://bugzilla.spamassassin.org/
+
+- Also, check out our wiki:
+
+http://wiki.spamassassin.org/
Index: masses/runGA
===================================================================
--- masses/runGA	(revision 20231)
+++ masses/runGA	(working copy)
@@ -1,47 +1,43 @@
 #!/bin/sh
 
 SCORESET="0"
+if [ "x$1" != "x" ] ; then
+    SCORESET=$1
+fi
+
 NAME="set$SCORESET"
+BASE="logs"
 
-if [ ! -f "ORIG/ham-$NAME.log" -o ! -f "ORIG/spam-$NAME.log" ]; then
+if [ ! -f "ORIG/masses-$NAME.log" ]; then
 	echo "Couldn't find logs for $NAME" >&2
 	exit 1
 fi
 
-if [ "x$1" = "x" ]; then
+if [ "x$2" = "x" ]; then
 echo "[Doing a scoreset $SCORESET score-generation run]"
 
 # Clean out old runs
 echo "[Cleaning up]"
-rm -rf spam-validate.log nonspam-validate.log ham-validate.log spam.log nonspam.log ham.log NSBASE SPBASE tmp make.output freqs perceptron.scores \
-	gen-$NAME.out gen-$NAME.scores gen-$NAME.validate
+
+rm -rf masses-validate.log masses.log $BASE tmp make.output freqs \
+    perceptron.scores gen-$NAME.out gen-$NAME.scores gen-$NAME.validate
 make clean >/dev/null
 
 # Generate 90/10 split logs
 echo "[Generating 90/10 split ham]"
-mkdir NSBASE SPBASE
-cd NSBASE
-../tenpass/split-log-into-buckets 10 < ../ORIG/ham-$NAME.log > /dev/null
-cat split-[1-9].log > nonspam.log
+mkdir $BASE
+cd $BASE
+../tenpass/split-log-into-buckets 10 < ../ORIG/masses-$NAME.log > /dev/null
+cat split-[1-9].log > masses.log
 rm -f split-[1-9].log
-mv split-10.log nonspam-validate.log
+mv split-10.log masses-validate.log
 
-echo "[Generating 90/10 split spam]"
-cd ../SPBASE
-../tenpass/split-log-into-buckets 10 < ../ORIG/spam-$NAME.log > /dev/null
-cat split-[1-9].log > spam.log
-rm -f split-[1-9].log
-mv split-10.log spam-validate.log
 cd ..
 
 echo "[Setting up for gen run]"
 # Ok, setup for a run
-ln -s SPBASE/spam.log .
-ln -s NSBASE/nonspam.log .
-ln -s NSBASE/nonspam.log ham.log
-ln -s SPBASE/spam-validate.log .
-ln -s NSBASE/nonspam-validate.log .
-ln -s NSBASE/nonspam-validate.log ham-validate.log
+ln -s $BASE/masses.log .
+ln -s $BASE/masses-validate.log .
 
 # try to find number of processors
 numcpus=`cpucount 2>/dev/null || egrep -c '^processor\b' /proc/cpuinfo 2>/dev/null || echo 1`
@@ -57,11 +53,12 @@
 
 else
 
+echo "Make sure 50_scores.cf has been replaced appropriately"
+
 # This needs to have 50_scores.cf in place first ...
 echo "[gen validation results]"
-./logs-to-c --spam=SPBASE/spam-validate.log \
-	--nonspam=NSBASE/nonspam-validate.log \
-	--count --cffile=../rules --scoreset=$SCORESET | tee gen-$NAME.validate
+./fp-fn-statistics --logfile=$BASE/masses-validate.log \
+	--cffile=../rules --scoreset=$SCORESET | tee gen-$NAME.validate
 
 echo "[STATISTICS file generation]"
 ./mk-baseline-results $SCORESET | tee gen-$NAME.statistics
Index: masses/lint-rules-from-freqs
===================================================================
--- masses/lint-rules-from-freqs	(revision 20231)
+++ masses/lint-rules-from-freqs	(working copy)
@@ -16,124 +16,214 @@
 # limitations under the License.
 # </@LICENSE>
 
+=head1 NAME
+
+lint-rules-from-freqs - Try to find problems with SpamAssassin rules
+
+=head1 SYNOPSIS
+
+lint-rules-from-freqs [options]
+
+ Options:
+    -c,--cffile=path	  Use path as the rules directory
+    -s,--scoreset=n	  Use scoreset n
+    -l,--logfile=file	  Read in file instead of masses.log
+    -f			  Also take into account false positives/negatives
+
+=head1 DESCRIPTION
+
+This script analyzes SpamAssassin tests, based on the hit frequencies
+and S/O ratios from a mass-check log (masses.log).  This script can
+also optionally take into account the false positive/negative
+frequencies.
+
+The script first uses the SpamAssassin rules parser to report on any
+illegal syntax. Then it checks the rules match frequencies from the
+mass-check log in order to determine how effective the rule is.
+
+=head1 BUGS
+
+Please report bugs to http://bugzilla.spamassassin.org/
+
+=head1 SEE ALSO
+
+L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>
+
+=cut
+
+
+use FindBin;
+use lib "$FindBin::Bin/../lib";
+use Mail::SpamAssassin::Masses;
+use Mail::SpamAssassin;
+use Getopt::Long qw(:config bundling auto_help);
+use strict;
+use warnings;
+
 # any tests that get less than this % of matches on *both* spam or nonspam, are
 # reported.
 my $LOW_MATCHES_PERCENT = 0.03;
-my $scoreset = 0;
 
-sub usage {
-  die "
-lint-rules-from-freqs: perform 'lint' testing on SpamAssassin rules and scores
+use vars qw($opt_c $opt_l $opt_s $opt_f $opt_p);
 
-usage: ./lint-rules-from-freqs [-f falsefreqs] < freqs > badtests
+GetOptions("c|cffile=s@" => \$opt_c,
+	   "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
+	   "l|logfile=s" => \$opt_l,
+	   "f|falses" => \$opt_f);
 
-This analyzes SpamAssassin tests, based on the hit frequencies and S/O ratios
-from a mass-check logfile pair.
 
-The 'freqs' argument is the frequency of hits in all messages ('hit-frequencies
--x -p' output).
+$opt_s = 0 unless defined $opt_s;
+$opt_l ||= "masses.log";
 
-The 'falsefreqs' argument is frequencies of hits in false-positives and
-false-negatives only ('hit-frequencies -x -p -f' output).
+if (!$opt_c || !scalar(@$opt_c)) {
+    # Try to read this in from the log, if possible
+    open IN, $opt_l or die "Can't open $opt_l: $!";
+    my $files = 0; # are we in the files section?
+    while(<IN>) { # scan the leading '#' header lines of the log
+	if (!$files) {
+	    if (/^\# SVN revision:/) {
+		$opt_c = [ "$FindBin::Bin/../rules" ];
+		last;
+	    } elsif (/^\# Using configuration:$/) {
+		$files = 1;
+	    }
+	} elsif (/^\#\s+(.*?)\s*$/) { # non-greedy, so trailing whitespace is not captured into the path
+	    push (@$opt_c, $1);
+	} else {
+	    # All done!
+	    last;
+	}
+    }
 
-";
-}
+    if (!defined $opt_c) {
+      $opt_c = [ "$FindBin::Bin/../rules" ];
+    }
 
-my $opt_falsefreqs;
-while ($#ARGV >= 0) {
-  $_ = shift @ARGV;
-  if (/^-f/) { $_ = shift @ARGV; $opt_falsefreqs = $_; }
-  elsif (/^-s/) { $_ = shift @ARGV; $scoreset = $_; }
-  else { usage(); }
+    foreach my $file (@$opt_c) {
+	die "Can't read $file" unless -r $file;
+    }
 }
 
 print "BAD TESTS REPORT\n";
-readrules();
-print "\n" .((scalar keys %rulefile) + 1). " rules found.\n";
+# First, do a --lint
+
 print "\nRule file syntax issues:\n\n";
-lintrules();
 
-if ($opt_falsefreqs) {
-  open (FALSE, "<$opt_falsefreqs");
-  while (<FALSE>) {
-    if (!/^\s*([\d\.]+)/) {
-      my ($overall, $spam, $nons, $so, $score, $name) = split (' ');
-      next unless ($name =~ /\S/);
-      $falsefreqs_spam{$name} = $spam;
-      $falsefreqs_nons{$name} = $nons;
-      $falsefreqs_so{$name} = $so;
+{
+  local (*STDERR) = \*STDOUT; # Get lint errors on STDOUT
+
+  # Read the config ourselves...
+
+  # Read init.pre from each directory, then glob for the rest.
+
+  my $cf_txt = '';
+  my @files;
+  my @dirs;
+  foreach my $file (@$opt_c) {
+    if (-d $file) {
+      if  (-r "$file/init.pre") {
+	push @files, "$file/init.pre";
+      }
+      push @dirs, $file;
     }
+    else {
+      push @files, $file;
+    }
   }
-  close FALSE;
-}
+  foreach my $dir (@dirs) {
+    my @cfs = glob("$dir/*.cf");
+    push @files, grep { -r $_ } @cfs;
+  }
 
-while (<>) {
-  if (!/^\s*([\d\.]+)/) {
-    $output{'a_header'} = $_; next;
+  foreach my $file (@files) {
+    if (-r $file) {
+      open IN, $file;
+      $cf_txt .= "file start $file\n";
+      $cf_txt .= join('', <IN>);
+      $cf_txt .= "\nfile end $file\n";
+      close IN;
+    }
   }
 
+  my $spamtest = new Mail::SpamAssassin({config_text => $cf_txt});
+
+  $spamtest->lint_rules();
+}
+
+
+# Next, check for other stuff
+my $masses = Mail::SpamAssassin::Masses->new({rulesdir => $opt_c,
+					      scoreset => $opt_s, #,,
+					      falses => $opt_f,
+					      logfile => $opt_l});
+
+$masses->readlogs();
+$masses->do_statistics();
+
+my $rules = $masses->get_rules_array();
+
+
+my %output;
+
+foreach my $rule (@$rules) {
+
   my $badrule;
-  my ($overall, $spam, $nons, $so, $score, $name) = split (' ');
-  next unless ($name =~ /\S/);
 
-  my $ffspam = $falsefreqs_spam{$name};
-  my $ffnons = $falsefreqs_nons{$name};
-  my $ffso = $falsefreqs_so{$name};
+  next if ($rule->{tflags} =~ /\bnet\b/ && ($opt_s % 2) == 0);
+  next if ($rule->{tflags} =~ /\buserconf\b/);
 
-  my $tf = $tflags{$name};
-  next if ($tf =~ /net/ && ($scoreset % 2) == 0);
-  next if ($tf =~ /userconf/);
+  if ($rule->{freq_spam} == 0 && $rule->{freq_ham} == 0) {        # sanity!
 
-  if ($overall == 0.0 && $spam == 0.0 && $nons == 0.0) {        # sanity!
     $badrule = 'no matches';
 
   } else {
-    if ($score < 0.0) {
+    if ($rule->{score} < 0.0) {
       # negative score with more spams than nonspams? bad rule.
-      if ($tf !~ /nice/ && $so > 0.5 && $score < 0.5) {
+      if (!$rule->{isnice} && $rule->{soratio} > 0.5 && $rule->{score} < 0.5) {
         $badrule = 'non-nice but -ve score';
       }
-
-      if ($tf =~ /nice/ && $so > 0.5 && $score < 0.5) {
-        if ($ffso < 0.5) {
+      if ($rule->{isnice} && $rule->{soratio} > 0.5 && $rule->{score} < 0.5) {
+        if ($opt_f && $rule->{freq_fn} < $rule->{freq_fp}) {
           $badrule = 'fn';
-        } else {
-          # ignore, the FNs are overridden by other tests so it doesn't
-          # affect the overall results.
         }
+        # else {
+        # ignore, the FNs are overridden by other tests so it doesn't
+        # affect the overall results.
+        # }
       }
 
       # low number of matches overall
-      if ($nons < $LOW_MATCHES_PERCENT) 
+      if ($rule->{ham_percent} < $LOW_MATCHES_PERCENT)
                  { $badrule ||= ''; $badrule .= ', low matches'; }
 
-    } elsif ($score > 0.0) {
+    } elsif ($rule->{score} > 0.0) {
       # positive score with more nonspams than spams? bad.
-      if ($tf =~ /nice/ && $so < 0.5 && $score > 0.5) {
+      if ($rule->{isnice} && $rule->{soratio} < 0.5 && $rule->{score} > 0.5) {
         $badrule = 'nice but +ve score';
       }
-
-      if ($tf !~ /nice/ && $so < 0.5 && $score > 0.5) {
-        if ($ffso > 0.5) {
+ 
+      if (!$rule->{isnice} && $rule->{soratio} < 0.5 && $rule->{score} > 0.5) {
+        if ($opt_f && $rule->{freq_fp} > $rule->{freq_fn}) {
           $badrule = 'fp';
-        } else {
-          # ignore, the FPs are overridden by other tests so it doesn't
-          # affect the overall results.
         }
+        # else {
+        # ignore, the FPs are overridden by other tests so it doesn't
+        # affect the overall results.
+        # }
       }
-
+ 
       # low number of matches overall
-      if ($spam < $LOW_MATCHES_PERCENT) 
+      if ($rule->{spam_percent} < $LOW_MATCHES_PERCENT)
                  { $badrule ||= ''; $badrule .= ', low matches'; }
-
-    } elsif ($score == 0.0) {
+ 
+    } elsif ($rule->{score} == 0.0) {
       $badrule = 'score is 0';
     }
   }
-
+ 
   if (defined $badrule) {
-    $badrule =~ s/^, //; chomp;
-    $output{$badrule} .= $_ . " ($badrule)\n";
+    $badrule =~ s/^, //;
+    $output{$badrule} .= $rule->{name} . " ($badrule)\n";
   }
 }
 
@@ -156,182 +246,3 @@
 exit;
 
 
-sub concat_rule_lang {
-  my $rule = shift;
-  my $lang = shift;
-
-  if (defined $lang && $lang ne '') {
-    return "[$lang]_$rule";
-  } else {
-    return $rule;
-  }
-}
-
-# note: do not use parse-rules-for-masses here, we need to do linting instead
-# of your average parse
-sub readrules {
-  my @files = <../rules/[0-9]*.cf>;
-  my $file;
-  %rulesfound = ();
-  %langs = ();
-  foreach $file (@files) {
-    open (IN, "<$file");
-    while (<IN>) {
-      s/#.*$//g; s/^\s+//; s/\s+$//; next if /^$/;
-
-      # make all the foo-bar stuff foo_bar
-      1 while s/^(\S+)-/\1_/g;
-      1 while s/^(lang\s+\S+\s+\S+)-/\1_/g;
-
-      my $lang = '';
-      if (s/^lang\s+(\S+)\s+//) {
-        $lang = $1; $langs{$1} = 1;
-      }
-
-      if (/^(header|rawbody|body|full|uri|meta)\s+(\S+)\s+/) {
-        $rulesfound{$2} = 1;
-        $rulefile{$2} ||= $file;
-        $scorefile{$1} = $file;
-        $score{$2} ||= 1.0;
-        $tflags{$2} ||= '';
-        $descfile{$2} ||= $file;       # a rule with no score or desc is OK
-	$description{$2}->{$lang} = undef;
-
-        if (/^body\s+\S+\s+eval:/) {
-          # ignored
-        } elsif (/^body\s+\S+\s+(.*)$/) {
-          my $re = $1;
-
-	  # If there's a ( in a rule where it should be (?:, flag it.
-	  # but ignore [abc(] ...
-          if ($re =~ /[^\\]\([^\?]/ && $re !~ /\[[^\]]*[^\\]\(/) { 
-            print "warning: non-(?:...) capture in regexp in $file: $_\n";
-          }
-          if ($re =~ /\.[\*\+]/) { 
-            print "warning: .* in regexp in $file: $_\n";
-          }
-          if ($re =~ /[^\\]\{(\d*),?(\d*?)\}/) {
-            if ($1 > 120 || $2 > 120) {
-              print "warning: long .{n} in regexp in $file: $_\n";
-            }
-          }
-        }
-
-      } elsif (/^describe\s+(\S+)\s+(.*?)\s*$/) {
-        $rulesfound{$1} = 1;
-        $descfile{concat_rule_lang ($1, $lang)} ||= $file;
-        $descfile{$1} ||= $file;
-	$description{$1}->{$lang} = $2;
-      } elsif (/^tflags\s+(\S+)\s+(.+)$/) {
-        $rulesfound{$1} = 1;
-        $tflags{$1} = $2;
-        $tflagsfile{concat_rule_lang ($1, $lang)} = $file;
-        $tflagsfile{$1} = $file;
-      } elsif (/^score\s+(\S+)\s+(.+)$/) {
-        $rulesfound{$1} = 1;
-        $scorefile{concat_rule_lang ($1, $lang)} = $file;
-        $scorefile{$1} = $file;
-        $score{$1} = $2;
-      } elsif (/^(clear_report_template|clear_spamtrap_template|report|spamtrap|
-                clear_terse_report_template|terse_report|
-                required_score|ok_locales|ok_languages|test|lang|
-                spamphrase|whitelist_from|require_version|
-		clear_unsafe_report_template|unsafe_report|
-		(?:bayes_)?auto_learn_threshold_nonspam|(?:bayes_)?auto_learn_threshold_spam|
-		(?:bayes_)?auto_learn
-                )/x) {
-        next;
-      } else {
-        print "warning: unknown rule in $file: $_\n";
-      }
-    }
-    close IN;
-  }
-  @langsfound = sort keys %langs;
-  @rulesfound = sort keys %rulesfound;
-}
-
-sub lintrules {
-  my %possible_renames = ();
-
-  foreach my $rule (@rulesfound) {
-    my $match = $rule;
-    $match =~ s/_\d+[^_]+$//gs;    # trim e.g. "_20K"
-    $match =~ s/[^A-Z]+//gs;    # trim numbers etc.
-
-    if (defined ($rulefile{$rule}) && $possible_renames{$match} !~ / \Q$rule\E\b/) {
-      $possible_renames{$match} .= " ".$rule;
-    }
-    $possible_rename_matches{$rule} = $match;
-  }
-
-  foreach my $lang ('', @langsfound) {
-    foreach my $baserule (@rulesfound) {
-      next if ( $baserule =~ /^__/ || $baserule =~ /^T_/ );
-
-      my $rule = concat_rule_lang ($baserule, $lang);
-      my $f = $descfile{$rule};
-      my $warned = '';
-
-      if (defined $f && !defined ($rulefile{$rule})
-                && !defined ($rulefile{$baserule}))
-      {
-        print "warning: $baserule has description, but no rule: $f\n";
-        $warned .= ' lamedesc';
-      }
-
-	# Check our convention for rule length
-	if ( (($lang ne '' && defined($rulefile{$rule})) || ($lang eq '' && defined ($rulefile{$baserule}))) && length $baserule > 22 ) {
-	  print "warning: $baserule has a name longer than 22 chars: $f\n";
-	}
- 	# Check our convention for rule length
-	if ( (($lang ne '' && defined($rulefile{$rule})) || ($lang eq '' && defined ($rulefile{$baserule}))) && defined $description{$baserule}->{$lang} && length $description{$baserule}->{$lang} > 50 ) {
-	  print "warning: $baserule has a description longer than 50 chars: $f\n";
-	}
-
-      # lang rule trumps normal rule
-      $f = $rulefile{$rule} || $rulefile{$baserule};
-      # if the rule exists, and the language/rule description doesn't exist ...
-      if ( defined $f && !defined $description{$baserule}->{$lang} )
-      {
-        print "warning: $baserule exists, ",( $lang ne '' ? "lang $lang, " : "" ),"but has no description: $f\n";
-        $warned .= ' lamedesc';
-      }
-
-
-      $f = $scorefile{$rule};
-      if (defined $f && !defined ($rulefile{$rule})
-                && !defined ($rulefile{$baserule}))
-      {
-        print "warning: $baserule has score, but no rule: $f\n";
-        $warned .= ' lamescore';
-      }
-
-      my $r = $possible_rename_matches{$rule};
-      if ($warned ne '' && defined $r) {
-        my @matches = split (' ', $possible_renames{$r});
-        if (scalar @matches != 0) {
-          my $text = '';
-
-          # now try and figure out "nearby" rules with no description/score
-          foreach my $baser (@matches) {
-            my $blang;
-            if ($descfile{$rule} =~ /text_(\S\S)\./) {
-              $blang = $1;
-            }
-            my $r = concat_rule_lang ($baser, $blang);
-            #warn "$r $descfile{$r} $descfile{$baser}";
-            next if ($warned =~ /lamedesc/ && (defined $descfile{$r}));
-            next if ($warned =~ /lamescore/ && (defined $scorefile{$r}));
-            $text .= " $baser";
-          }
-
-          if ($text ne '') {
-            print "warning: (possible renamed rule? $text)\n";
-          }
-        }
-      }
-    }
-  }
-}
-
Index: masses/Makefile
===================================================================
--- masses/Makefile	(revision 20231)
+++ masses/Makefile	(working copy)
@@ -3,34 +3,27 @@
 LDFLAGS=	-lm
 
 # What rule scoreset are we using?
-SCORESET =	0
+SCORESET =	3
+LOGFILE =	masses.log
 
 #### Should be no need to modify below this line
 
 all: badrules perceptron
 
 perceptron: perceptron.o
-	$(CC) -o perceptron perceptron.o $(LDFLAGS)
+	$(CC) -o perceptron perceptron.o $(LDFLAGS) 
 
-perceptron.o: tmp/rules.pl tmp/tests.h tmp/scores.h
+perceptron.o: tmp/tests.h
 	$(CC) $(CFLAGS) -c -o perceptron.o perceptron.c
 
-tmp/rules.pl: tmp/.created parse-rules-for-masses
-	perl parse-rules-for-masses -d ../rules -s $(SCORESET)
+tmp/tests.h: tmp/.created logs-to-c
+	perl logs-to-c --scoreset=$(SCORESET) --logfile=$(LOGFILE)
 
-tmp/tests.h: tmp/.created tmp/ranges.data logs-to-c
-	perl logs-to-c --scoreset=$(SCORESET)
+freqs: $(LOGFILE)
+	perl hit-frequencies -x -p -s $(SCORESET) --logfile=$(LOGFILE) > freqs
 
-tmp/scores.h: tmp/tests.h
-
-tmp/ranges.data: tmp/.created freqs score-ranges-from-freqs
-	perl score-ranges-from-freqs ../rules $(SCORESET) < freqs
-
-freqs: spam.log ham.log
-	perl hit-frequencies -x -p -s $(SCORESET) > freqs
-
 badrules: freqs
-	perl lint-rules-from-freqs < freqs > badrules
+	perl lint-rules-from-freqs -s $(SCORESET) --logfile=$(LOGFILE) > badrules
 
 tmp/.created:
 	-mkdir tmp
Index: masses/mass-check
===================================================================
--- masses/mass-check	(revision 20231)
+++ masses/mass-check	(working copy)
@@ -16,144 +16,251 @@
 # limitations under the License.
 # </@LICENSE>
 
-sub usage {
-  die <<ENDOFUSAGE;
-usage: mass-check [options] target ...
- 
-  -c=file       set configuration/rules directory
-  -p=dir        set user-prefs directory
-  -f=file       read list of targets from <file>
-  -j=jobs       specify the number of processes to run simultaneously
-  --net         turn on network checks!
-  --mid         report Message-ID from each message
-  --debug       report debugging information
-  --progress    show progress updates during check
-  --rewrite=OUT save rewritten message to OUT (default is /tmp/out)
-  --showdots    print a dot for each scanned message
-  --rules=RE    Only test rules matching the given regexp RE
-  --restart=N   restart all of the children after processing N messages
-  --deencap=RE  Extract SpamAssassin-encapsulated spam mails only if they
-                were encapsulated by servers matching the regexp RE
-                (default = extract all SpamAssassin-encapsulated mails)
- 
-  log options
-  -o            write all logs to stdout
-  --loghits     log the text hit for patterns (useful for debugging)
-  --loguris	log the URIs found
-  --hamlog=log  use <log> as ham log ('ham.log' is default)
-  --spamlog=log use <log> as spam log ('spam.log' is default)
- 
-  message selection options
-  -n            no date sorting or spam/ham interleaving
-  --after=N     only test mails received after time_t N (negative values
-                are an offset from current time, e.g. -86400 = last day)
-                or after date as parsed by Time::ParseDate (e.g. '-6 months')
-  --before=N    same as --after, except received times are before time_t N
-  --all         don't skip big messages
-  --head=N      only check first N ham and N spam (N messages if -n used)
-  --tail=N      only check last N ham and N spam (N messages if -n used)
- 
-  simple target options (implies -o and no ham/spam classification)
-  --dir         subsequent targets are directories
-  --file        subsequent targets are files in RFC 822 format
-  --mbox        subsequent targets are mbox files
-  --mbx         subsequent targets are mbx files
- 
-  Just left over functions we should remove at some point:
-  --bayes       report score from Bayesian classifier
- 
-  non-option arguments are used as target names (mail files and folders),
-  the target format is: <class>:<format>:<location>
-  <class>       is "spam" or "ham"
-  <format>      is "dir", "file", "mbx", or "mbox"
-  <location>    is a file or directory name.  globbing of ~ and * is supported
+=head1 NAME
 
-ENDOFUSAGE
-}
+mass-check - Generates SpamAssassin scores and results for large
+amounts of mail
 
+=head1 SYNOPSIS
+
+ mass-check [options] class:format:location ...
+ mass-check [options] {--dir | --file | --mbox} target ...
+ mass-check [options] -f file
+
+  Options:
+    -f=file       read list of targets from <file>
+    -j=jobs       specify the number of processes to run simultaneously
+    --net         turn on network checks!
+    --mid         report Message-ID from each message
+    --debug       report debugging information
+    --progress    show progress updates during check
+    --rewrite=OUT save rewritten message to OUT (default is /tmp/out)
+    --showdots    print a dot for each scanned message
+    --rules=RE    Only test rules matching the given regexp RE
+    --restart=N   restart all of the children after processing N messages
+
+    SpamAssassin options
+    -c=dir        set configuration/rules directory
+    -p=file       set user preferences file (default: none)
+    -s=dir        set site rules configuration directory
+    -u=dir        set user-state directory
+    --dist        assumes the script is being run from the masses/ dir of
+                  the unpacked tarball, and makes appropriate guesses for
+                  -c, -p, -s and -u
+    --deencap=RE  Extract SpamAssassin-encapsulated spam mails only if they
+                  were encapsulated by servers matching the regexp RE
+                  (default = extract all SpamAssassin-encapsulated mails)
+
+    log options
+    -o            write all logs to stdout
+    --loghits     log the text hit for patterns (useful for debugging)
+    --loguris	  log the URIs found
+    --log=file    log to <file> (masses.log is default)
+
+    message selection options
+    -n            no date sorting or spam/ham interleaving
+    --after=N     only test mails received after time_t N (negative values
+                  are an offset from current time, e.g. -86400 = last day)
+                  or after date as parsed by Time::ParseDate (e.g. '-6 months')
+    --before=N    same as --after, except received times are before time_t N
+    --all         don't skip big messages
+    --head=N      only check first N ham and N spam (N messages if -n used)
+    --tail=N      only check last N ham and N spam (N messages if -n used)
+
+    simple target options (implies -o and no ham/spam classification)
+    --dir         subsequent targets are directories
+    --file        subsequent targets are files in RFC 822 format
+    --mbox        subsequent targets are mbox files
+    --mbx         subsequent targets are mbx files
+
+    Just left over functions we should remove at some point:
+    --bayes       report score from Bayesian classifier
+    --hamlog=log  use <log> as ham log ('ham.log' is default)
+    --spamlog=log use <log> as spam log ('spam.log' is default)
+
+=head1 DESCRIPTION
+
+B<mass-check> is designed to assist with rule development and
+generation of SpamAssassin scores. It reads in mail from the
+location(s) specified on the command line (in the first form above),
+given in the form I<class:format:location>, where I<class> is either
+"spam" or "ham" (non-spam), I<format> is one of "dir" (Maildirs, MH,
+etc), "file", "mbox" (mboxes can be gzipped) or "mbx".
+
+B<mass-check> will analyze each message using SpamAssassin and
+generate one line of output per message (by default to masses.log) in
+the following format:
+
+ {s|h} {s|h} score filename tests-hit
+
+The first field is the message's class as given on the command line
+(ham or spam). The second is the message's class as determined by
+SpamAssassin. The third is the message's score, as determined by
+SpamAssassin. The fourth field contains the message's filename; for
+mboxes, this contains the filename and the byte offset from the
+beginning of the file separated by a period. The last field contains a
+list of all the tests the message hit separated by commas.
+
+If you want to run this on the currently installed version of
+SpamAssassin's rules for sitewide use, make sure your user_prefs file
+contains no rules.
+
+=head1 BUGS
+
+Please report bugs to http://bugzilla.spamassassin.org/
+
+=head1 SEE ALSO
+
+L<hit-frequencies(1)>, L<logs-to-c(1)>, L<Mail::SpamAssassin::Masses(3)>,
+L<perceptron(1)>
+
+=cut
+
 ###########################################################################
 
-use vars qw($opt_c $opt_p $opt_f $opt_j $opt_n $opt_o $opt_all $opt_bayes
-	    $opt_debug $opt_format $opt_hamlog $opt_head $opt_loghits
-	    $opt_mid $opt_mh $opt_ms $opt_net $opt_nosort $opt_progress
-	    $opt_showdots $opt_spamlog $opt_tail $opt_rules $opt_restart
-	    $opt_loguris $opt_after $opt_before $opt_rewrite $opt_deencap);
+use vars qw($opt_c $opt_p $opt_f $opt_j $opt_n $opt_o $opt_all
+	    $opt_bayes $opt_before $opt_debug $opt_dist $opt_format
+	    $opt_hamlog $opt_head $opt_log $opt_loghits $opt_mid
+	    $opt_mh $opt_ms $opt_net $opt_nosort $opt_progress
+	    $opt_s $opt_showdots $opt_spamlog $opt_tail $opt_rules
+	    $opt_restart $opt_loguris $opt_after $opt_rewrite $opt_u
+	    $opt_deencap);
 
 use FindBin;
 use lib "$FindBin::Bin/../lib";
 eval "use bytes";
 use Mail::SpamAssassin::ArchiveIterator;
 use Mail::SpamAssassin;
-use Getopt::Long;
+use Getopt::Long qw(:config bundling auto_help);
+use Pod::Usage;
 use POSIX qw(strftime);
 use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; };
+use strict; # Why wasn't this on?
 use Config;
 
 # default settings
-$opt_c = "$FindBin::Bin/../rules";
-$opt_p = "$FindBin::Bin/spamassassin";
+
 $opt_j = 1;
 $opt_net = 0;
-$opt_hamlog = "ham.log";
-$opt_spamlog = "spam.log";
+$opt_log = "masses.log";
 
-GetOptions("c=s", "p=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug",
-	   "hamlog=s", "head=i", "loghits", "mh", "mid", "ms", "net",
-	   "progress", "rewrite:s", "showdots", "spamlog=s", "tail=i",
-	   "rules=s", "restart=i", "after=s", "before=s", "loguris", "deencap=s",
+GetOptions("c|cffile=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug",
+	   "deencap=s", "dist!", "hamlog=s", "head=i", "log=s",
+	   "loghits", "mh", "mid", "ms", "net", "p=s", "progress",
+	   "rewrite:s", "s=s", "showdots", "spamlog=s", "tail=i",
+	   "rules=s", "restart=i", "u=s", "after=s", "before=s", "loguris",
 	   "dir" => sub { $opt_format = "dir"; },
-	   "file" => sub { $opt_format = "file"; },
+	   "file" => sub {$opt_format = "file"; },
 	   "mbox" => sub { $opt_format = "mbox"; },
 	   "mbx" => sub { $opt_format = "mbx"; },
-	   '<>' => \&target) or usage();
+	   '<>' => \&target) or pod2usage(2);
 
+if ($opt_hamlog || $opt_spamlog) { # Old style logging
+  $opt_hamlog ||= "ham.log";
+  $opt_spamlog ||= "spam.log";
+}
+
+my @targets;
+
 if ($opt_f) {
   open(F, $opt_f) || die $!;
   push(@targets, map { chomp; $_ } <F>);
   close(F);
 }
 
-if (scalar @targets == 0) { usage(); }
+if (scalar @targets == 0) { pod2usage("No target defined!"); }
 
-#if ($opt_ms) {
-#find_missed($opt_spamlog);
-#}
-#elsif ($opt_mh) {
-#find_missed($opt_hamlog);
-#}
+# Auto-detect --dist option
+if (!defined $opt_dist) {
+  if (-f "$FindBin::Bin/../spamassassin.raw") {
+    warn "Automatically using --dist. Assuming you are running from the unpacked tarball. Use --no-dist to override.";
+    $opt_dist = 1;
+  }
+}
 
-$spamtest = new Mail::SpamAssassin ({
-  'debug'              			=> $opt_debug,
-  'rules_filename'     			=> $opt_c,
-  'userprefs_filename' 			=> "$opt_p/user_prefs",
-  'site_rules_filename'			=> "$opt_p/local.cf",
-  'userstate_dir'     			=> "$opt_p",
-  'save_pattern_hits'  			=> $opt_loghits,
-  'dont_copy_prefs'   			=> 1,
-  'local_tests_only'   			=> $opt_net ? 0 : 1,
-  'only_these_rules'   			=> $opt_rules,
-  'ignore_safety_expire_timeout'	=> 1,
-  PREFIX				=> '',
-  DEF_RULES_DIR        			=> $opt_c,
-  LOCAL_RULES_DIR      			=> '',
-});
+my $local_rules_dir;
 
+if ($opt_dist) { # Set defaults
+  $opt_c ||= "$FindBin::Bin/../rules";
+  $opt_p ||= "$FindBin::Bin/mass-check.cf";
+  $opt_u ||= "$FindBin::Bin/spamassassin";
+  $opt_s ||= "$FindBin::Bin/spamassassin";
+  $local_rules_dir = '';
+}
+else {
+  if (!$opt_u) {
+    # Assuming this is OK, since mass-check isn't supported on windows, is it?
+    # Also, should there be some check to make sure that previous mass-check stuff isn't in there?
+    # AFAICT, there isn't otherwise....
+    if ( -d "${ENV{HOME}}/.spamassassin" ) {
+      $opt_u = "${ENV{HOME}}/.spamassassin/mass-check";
+      warn "$opt_u already exists -- may contain files that will affect the results" if (-d $opt_u);
+      mkdir($opt_u, 0700) or warn "cannot create $opt_u: $!" if (! -d $opt_u);
+    }
+  }
+
+# Leave the rest to SA, we'll get it afterwards
+
+}
+
+# Expand only a leading "~" ("~" alone or "~/..."); a "~" elsewhere in a path is left untouched.
+$opt_s =~ s{^~(?=/|\z)}{$ENV{HOME}} if $opt_s;
+$opt_c =~ s{^~(?=/|\z)}{$ENV{HOME}} if $opt_c;
+$opt_p =~ s{^~(?=/|\z)}{$ENV{HOME}} if $opt_p;
+$opt_u =~ s{^~(?=/|\z)}{$ENV{HOME}} if $opt_u;
+
+
+my $spamtest = new Mail::SpamAssassin ({
+				       'debug'              			=> $opt_debug,
+				       'rules_filename'     			=> $opt_c,
+				       'userprefs_filename' 			=> $opt_p,
+				       'site_rules_filename'			=> $opt_s,
+				       'userstate_dir'     			=> $opt_u,
+				       'save_pattern_hits'  			=> $opt_loghits,
+				       'dont_copy_prefs'   			=> 1,
+				       'local_tests_only'   			=> $opt_net ? 0 : 1,
+				       'only_these_rules'   			=> $opt_rules,
+				       'ignore_safety_expire_timeout'	=> 1,
+				       DEF_RULES_DIR        			=> $opt_c,
+				       LOCAL_RULES_DIR      			=> $local_rules_dir,
+				      });
+
 $spamtest->compile_now(1);
-$spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf");
+if ($opt_dist) {
+  $spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf");
+}
 
 my $who   = `id -un 2>/dev/null`;   chomp $who;
 my $where = `uname -n 2>/dev/null`; chomp $where;
 my $when  = `date -u`;              chomp $when;
-my $revision = "unknown";
-if (open(TESTING, "$opt_c/70_testing.cf")) {
-  chomp($revision = <TESTING>);
-  $revision =~ s/.*\$Rev:\s*(\S+).*/$1/;
-  close(TESTING);
+my $revision;
+
+if ($opt_dist) {
+  my $rev = "unknown";
+  if (open(TESTING, "$opt_c/70_testing.cf")) {
+    chomp($rev = <TESTING>);
+    $rev =~ s/.*\$Rev:\s*(\S+).*/$1/;
+    close(TESTING);
+  }
+  $revision = "SVN revision: $rev";
 }
+else {
+  $revision = "Local";
+}
+
 my $log_header = "# mass-check results from $who\@$where, on $when\n" .
 		 "# M:SA version ".$spamtest->Version()."\n" .
-		 "# SVN revision: $revision\n" .
+		 "# $revision\n" .
 		 "# Perl version: $] on $Config{archname}\n";
+
+if (!$opt_dist) {
+  my @paths = ( $spamtest->{rules_filename}, $spamtest->{site_rules_filename}, $spamtest->{userprefs_filename} );
+  $log_header .= "# Using configuration:\n";
+  foreach my $file (grep { defined($_) && length($_) } @paths) {  # skip entries that are unset, so no empty "# " line is logged
+    $log_header .=  "# $file\n";
+  }
+}
+
 my $host = $ENV{'HOSTNAME'} || $ENV{'HOST'} || `hostname` || 'localhost';
 chomp $host;
 
@@ -222,7 +329,7 @@
     autoflush STDOUT 1;
     print STDOUT $log_header;
   }
-  else {
+  elsif ($opt_hamlog || $opt_spamlog) {
     open(HAM, "> $opt_hamlog");
     open(SPAM, "> $opt_spamlog");
     autoflush HAM 1;
@@ -230,6 +337,11 @@
     print HAM $log_header;
     print SPAM $log_header;
   }
+  else {
+    open(OUT, "> $opt_log");
+    autoflush OUT 1;
+    print OUT $log_header;
+  }
   $init_results = 1;
 }
 
@@ -239,25 +351,36 @@
   # don't open results files until we get here to avoid overwriting files
   &init_results if !$init_results;
 
-  if ($class eq "s") {
-    if ($opt_o) { print STDOUT $result; } else { print SPAM $result; }
-    $spam_count++;
+  if ($opt_o) {
+    print STDOUT $result;
   }
-  elsif ($class eq "h") {
-    if ($opt_o) { print STDOUT $result; } else { print HAM $result; }
-    $ham_count++;
+  elsif ($opt_spamlog || $opt_hamlog) {
+    if ($class eq "s") {
+      print SPAM $result;
+    } else {
+      print HAM $result;
+    }
   }
+  else {
+    print OUT $result;
+  }
 
   $total_count++;
 #warn ">> result: $total_count $class $time\n";
 
   if ($opt_progress) {
+    if ($class eq "s") {
+      $spam_count++;
+    }
+    else {
+      $ham_count++;
+    }
     progress($time);
   }
 }
 
 sub wanted {
-  my (undef, $id, $time, $dataref) = @_;
+  my ($class, $id, $time, $dataref) = @_;
   my $out;
 
   my $ma = $spamtest->parse($dataref, 1);
@@ -308,18 +431,22 @@
     push(@extra, "mid=$mid");
   }
 
-  my $yorn;
+  my $result;
   my $score;
   my $tests;
   my $extra;
 
   if ($opt_loguris) {
-    $yorn = '.';
+    $result = '.';
     $score = 0;
     $tests = join(" ", sort @uris);
     $extra = '';
   } else {
-    $yorn = $status->is_spam() ? 'Y' : '.';
+    if ($status->is_spam()) {
+      $result = "s";
+    } else {
+      $result = "h";
+    }
     $score = $status->get_score();
     $tests = join(",", sort(grep(length,$status->get_names_of_tests_hit(),$status->get_names_of_subtests_hit())));
     $extra = join(",", @extra);
@@ -333,7 +460,7 @@
 
   $id =~ s/\s/_/g;
 
-  $out .= sprintf("%s %2d %s %s %s\n", $yorn, $score, $id, $tests, $extra);
+  $out .= sprintf("%s %s %05.2f %s %s %s\n", $class, $result, $score, $id, $tests, $extra);
 
   if ($tests =~ /MICROSOFT_EXECUTABLE|MIME_SUSPECT_NAME/) {
     $out .= logkilled($ma, $id, "possible virus");
Index: masses/mk-baseline-results
===================================================================
--- masses/mk-baseline-results	(revision 20231)
+++ masses/mk-baseline-results	(working copy)
@@ -10,7 +10,7 @@
 echo "Classification success on test corpora, at default threshold:"
 echo
 
-./logs-to-c --spam=spam-validate.log --nonspam=nonspam-validate.log --threshold 5 --count --scoreset=$SCORESET | sed -e 's/^Reading.*//' -e '/^$/d'
+./fp-fn-statistics --logfile=masses-validate.log --threshold 5 --scoreset=$SCORESET
 
 echo
 echo "Results on test corpora at various alternative thresholds:"
@@ -18,7 +18,7 @@
 
 # list a wide range of thresholds, so that we can make graphs later ;)
 for thresh in -4 -3 -2 -1 0 1 2 3 4 4.5 5.5 6 6.5 7 8 9 10 12 15 17 20 ; do
-  ./logs-to-c --spam=spam-validate.log --nonspam=nonspam-validate.log --threshold $thresh --count --scoreset=$SCORESET | sed -e 's/^Reading.*//' -e '/^$/d'
+  ./fp-fn-statistics --logfile=masses-validate.log --threshold $thresh --scoreset=$SCORESET
   echo
 done
 
Index: masses/README
===================================================================
--- masses/README	(revision 20231)
+++ masses/README	(working copy)
@@ -33,8 +33,6 @@
 
 See the CORPUS_POLICY file for more details.
 
-
-
 HOW TO SUBMIT RESULTS BACK TO US
 --------------------------------
 
@@ -52,11 +50,11 @@
   This script is used to perform "mass checks" of a set of mailboxes, Cyrus
   folders, and/or MH mail spools.  It generates summary lines like this:
 
-  Y  7 /home/jm/Mail/Sapm/1382 SUBJ_ALL_CAPS,SUPERLONG_LINE,SUBJ_FULL_OF_8BITS
+  s s 07.22 /home/jm/Mail/Sapm/1382 SUBJ_ALL_CAPS,SUPERLONG_LINE,SUBJ_FULL_OF_8BITS
 
   or for mailboxes,
 
-  .  1 /path/to/mbox:<5.1.0.14.2.20011004073932.05f4fd28@localhost> TRACKER_ID,BALANCE_FOR_LONG
+  h h 01.32 /path/to/mbox:<5.1.0.14.2.20011004073932.05f4fd28@localhost> TRACKER_ID,BALANCE_FOR_LONG
 
   listing the path to the message or its message ID, its score, and the tests
   that triggered on that mail.
@@ -65,23 +63,23 @@
   get good hits with few false positives, etc., and re-score the tests to
   optimise the ratio.
 
-  This script relies on the spamassassin distribution directory living in "..".
+  If given the --dist option, this script relies on the spamassassin
+  distribution directory living in "..". If this script is not in the
+  distribution directory, it will generate logs based on the site-wide
+  rules, as well as personal rules.
 
-
 logs-to-c :
 
-  Takes the "spam.log" and "nonspam.log" files and converts them into C
-  source files and simplified data files for use by the C score optimization
-  algorithm.  (Called by "make" when you build the perceptron, so generally
-  you won't need to run it yourself.)
+  Takes the "masses.log" file and converts it into C source files
+  and simplified data files for use by the C score optimization
+  algorithm.  (Called by "make" when you build the perceptron, so
+  generally you won't need to run it yourself.)
 
-
 hit-frequencies :
 
   Analyses the log files and computes how often each test hits, overall,
   for spam mails and for non-spam.
 
-
 mk-baseline-results :
 
   Compute results for the baseline scores (read from ../rules/*).  If you
@@ -91,7 +89,6 @@
   It will output statistics on the current ruleset to ../rules/STATISTICS.txt,
   suitable for a release build of SpamAssassin.
 
-
 perceptron.c :
 
   Perceptron learner by Henry Stern.  See "README.perceptron" for details.
Index: masses/fp-fn-statistics
===================================================================
--- masses/fp-fn-statistics	(revision 20231)
+++ masses/fp-fn-statistics	(working copy)
@@ -1,3 +1,191 @@
-#!/bin/sh
+#!/usr/bin/perl -w
+#
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
 
-exec ./logs-to-c --count $*
+=head1 NAME
+
+fp-fn-statistics - Display statistics about the quality of scores
+
+=head1 SYNOPSIS
+
+fp-fn-statistics [options]
+
+  Options: 
+    -c,--cffile=path	  Use path as the rules directory
+    -s,--scoreset=n	  Use scoreset n
+    -l,--logfile=file	  Read in file instead of masses.log
+    -t,--threshold=n      Use a spam/ham threshold of n (default: 5)
+    --lambda=n            Use a lambda value of n
+
+=head1 DESCRIPTION
+
+B<fp-fn-statistics> first calculates the score each message from a
+masses.log would have under a new set of scores. It then aggregates
+the number of messages correctly and incorrectly found as spam and
+ham, and their average scores.
+
+In addition, B<fp-fn-statistics> determines the "Total Cost Ratio" as
+a result of the false positives and negatives mentioned above. This
+calculation takes into account the value of lambda, which represents the
+cost of recovering a false positive, where 1 indicates a message is tagged
+only, 9 means the message is mailed back to sender asking for a token
+(TMDA style) and 999 means a message is deleted. The default, 5,
+represents the message being moved to an infrequently read folder.
+
+=cut
+
+use FindBin;
+use lib "$FindBin::Bin/../lib";
+use Mail::SpamAssassin::Masses;
+use Getopt::Long qw(:config bundling auto_help);
+use Pod::Usage;
+use strict;
+use warnings;
+
+use vars qw{$opt_c $opt_l $opt_s $opt_t $opt_lambda};
+
+GetOptions("c|cffile=s@" => \$opt_c,
+	   "l|logfile=s" => \$opt_l,
+	   "s|scoreset=i" => \$opt_s,
+           "t|threshold=f" => \$opt_t,
+           "lambda=f" => \$opt_lambda);  # takes a numeric value, as documented (--lambda=n)
+
+$opt_l ||= "masses.log";
+
+if (!$opt_c || !scalar(@$opt_c)) {
+    # Try to read this in from the log, if possible
+    open IN, $opt_l or die "Can't open $opt_l: $!";
+    my $files = 0; # are we in the files section?
+    while(<IN>) {
+	if (!$files) {
+	    if (/^\# SVN revision:/) {
+		$opt_c = [ "$FindBin::Bin/../rules" ];
+		last;
+	    } elsif (/^\# Using configuration:$/) {
+		$files = 1;
+	    }
+	} elsif (/^\#\s+(.*)\s*$/) {
+	    push (@$opt_c, $1);
+	} else {
+	    # All done!
+	    last;
+	}
+    }
+
+    if (!defined $opt_c) {
+      $opt_c = [ "$FindBin::Bin/../rules" ];
+    }
+
+    foreach my $file (@$opt_c) {
+	die "Can't read $file" unless -r $file;
+    }
+}
+
+$opt_t = (defined($opt_t) ? $opt_t : 5);
+$opt_s ||= 0;
+$opt_lambda ||= 5;
+
+my $nybias = 10;
+
+
+my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
+                                               scoreset => $opt_s, # ,,
+                                               logfile => $opt_l});
+
+$masses->readlogs();
+
+my $logs = $masses->get_logs();
+
+my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore) = (0,0,0,0,0,0,0,0);
+
+my $num_spam = $masses->get_num_spam();
+my $num_ham = $masses->get_num_ham();
+my $num_logs = $num_spam + $num_ham;
+
+my $count = 0;
+
+my $score;
+
+foreach my $log (@$logs) {
+
+  $score = 0;
+  foreach my $test (@{$log->{tests_hit}}) {
+
+    next if ($test->{issubrule});
+    next if (!$test->{score});
+
+    $score += $test->{score};
+
+  }
+
+  if ($score >= $opt_t) {
+    if ($log->{isspam}) {
+      $ga_yy++;
+      $yyscore += $score;
+    }
+    else {
+      $ga_ny++;
+      $nyscore += $score;
+    }
+  } else {
+    if ($log->{isspam}) {
+      $ga_yn++;
+      $ynscore += $score;
+    }
+    else {
+      $ga_nn++;
+      $nnscore += $score;
+    }
+  }
+}
+
+$nybias = $nybias * ($num_spam / $num_ham);
+
+my $fprate = ($ga_ny / $num_logs) * 100.0;
+my $fnrate = ($ga_yn / $num_logs) * 100.0;
+
+printf ("\n# SUMMARY for threshold %3.1f:\n", $opt_t);
+printf "# Correctly non-spam: %6d  %4.2f%%  (%4.2f%% of non-spam corpus)\n", $ga_nn,
+  ($ga_nn /  $num_logs) * 100.0, ($ga_nn /  $num_ham) * 100.0;
+printf "# Correctly spam:     %6d  %4.2f%%  (%4.2f%% of spam corpus)\n" , $ga_yy,
+  ($ga_yy /  $num_logs) * 100.0, ($ga_yy /  $num_spam) * 100.0;
+printf "# False positives:    %6d  %4.2f%%  (%4.2f%% of nonspam, %6.0f weighted)\n", $ga_ny,
+  $fprate, ($ga_ny /  $num_ham) * 100.0, $nyscore*$nybias;
+printf "# False negatives:    %6d  %4.2f%%  (%4.2f%% of spam, %6.0f weighted)\n", $ga_yn,
+  $fnrate, ($ga_yn /  $num_spam) * 100.0, $ynscore;
+
+# convert to the TCR metrics used in the published lit
+my $nspamspam = $ga_yy;     # spam scored at or above the threshold
+my $nspamlegit = $ga_yn;    # spam scored below the threshold (false negatives)
+my $nlegitspam = $ga_ny;    # ham scored at or above the threshold (false positives)
+my $nlegitlegit = $ga_nn;   # ham scored below the threshold
+my $nlegit = $num_ham;
+my $nspam = $num_spam;
+
+my $werr = ($opt_lambda * $nlegitspam + $nspamlegit)
+  / ($opt_lambda * $nlegit + $nspam);
+
+my $werr_base = $nspam
+  / ($opt_lambda * $nlegit + $nspam);
+
+$werr ||= 0.000001;     # avoid / by 0
+my $tcr = $werr_base / $werr;
+my $sr = ($nspamspam / $nspam) * 100.0;
+# Precision is undefined when no message reached the threshold; report 0 instead of dividing by zero.
+my $sp = ($nspamspam + $nlegitspam) ? ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0 : 0;
+printf "# TCR: %3.6f  SpamRecall: %3.3f%%  SpamPrec: %3.3f%%  FP: %3.2f%%  FN: %3.2f%%\n", $tcr, $sr, $sp, $fprate, $fnrate;
+
Index: masses/extract-message-from-mbox
===================================================================
--- masses/extract-message-from-mbox	(revision 20231)
+++ masses/extract-message-from-mbox	(working copy)
@@ -19,30 +19,61 @@
 use bytes;
 
 use vars qw {
-  $opt_f $opt_h $opt_m $opt_H
+  $opt_h $opt_m
 };
 
-use Getopt::Std;
-getopts("f:hmH");
 
-sub usage {
-  die "extract-message-from-mbox [-f=file] [-m] [-H] offset
+use Getopt::Long qw(:config bundling auto_help);
+use Pod::Usage;
 
-  Extracts the message starting at offset from file (or stdin). Very
-  useful in combination with mass-check logs and mboxes. If the -m
-  option is used, the input should be in \"mass-check\" format (as
-  output by mass-check). Use the -H option to just output headers.
-";
-}
+GetOptions("m|mass-check" => \$opt_m, "h|H|headers" => \$opt_h);
 
-usage() if($opt_h || (!defined($ARGV[0]) && !$opt_m));
-my $offset = $ARGV[0];
+=head1 NAME
 
+extract-message-from-mbox - Extract a message from an mbox
+
+=head1 SYNOPSIS
+
+ extract-message-from-mbox [--headers] <mbox>.<offset>
+ extract-message-from-mbox --mass-check
+
+ Options:
+  -h, --headers       Display only message headers
+  -m, --mass-check    Read mass-check output from stdin
+
+=head1 DESCRIPTION
+
+B<extract-message-from-mbox> extracts the message from I<mbox>
+starting at the byte offset I<offset>. Very useful in combination with
+mass-check logs and mboxes. If the -m or --mass-check option is used,
+the input should be in "mass-check" format (as output by
+mass-check). Use the -H option to just output headers.
+
+=head1 EXAMPLES
+
+To show messages that hit the rule BAYES_99
+
+grep BAYES_99 masses.log | extract-message-from-mbox -m
+
+To show the message indicated by "/path/to/my/mbox.1234"
+
+extract-message-from-mbox /path/to/my/mbox.1234
+
+=cut
+
+
+
 if($opt_m) {
   masscheck();
 } else {
-  $opt_f ||= '&STDIN';
-  extract($opt_f, $offset);
+  pod2usage("No <mbox>.<offset> argument given") unless @ARGV;  # otherwise the script would exit silently
+  foreach my $message (@ARGV) {
+    if ($message =~ /^(.*?)(?:\.(\d+))?$/) {  # the .<offset> suffix is optional; a bare <mbox> means offset 0
+      extract($1, ($2 || 0));
+    } else {
+      pod2usage("Argument must be of the form <mbox>.<offset>");
+    }
+  }
 }
 
 sub extract {
@@ -61,14 +92,14 @@
       $found++ if(/^From /);
       last if($found == 3);
       print;
-      last if ($opt_H && /^$/) # empty line? end of headers
+      last if ($opt_h && /^$/) # empty line? end of headers
     }
   }
 }
 
 sub masscheck {
   while (<STDIN>) {
-    my $mail = (split(/\s+/, $_))[2];
+    my $mail = (split(/\s+/, $_))[3];
     $mail =~ tr/_/ /;
     if ($mail =~ /^(.*)\.(\d+)$/) {
       extract($1, $2);
Index: masses/logs-to-c
===================================================================
--- masses/logs-to-c	(revision 20231)
+++ masses/logs-to-c	(working copy)
@@ -16,257 +16,250 @@
 # limitations under the License.
 # </@LICENSE>
 
-use Getopt::Long;
-use vars qw($opt_cffile $opt_count $opt_lambda $opt_threshold
-		$opt_spam $opt_nonspam);
+=head1 NAME
 
-GetOptions("cffile=s", "count", "lambda=f", "threshold=f", "spam=s", "nonspam=s", "scoreset=i");
-my $argcffile = $opt_cffile;
+logs-to-c - Convert a mass-check log into perceptron format
 
-my $justcount = 0;
-if ($opt_count) { $justcount = 1; }
+=head1 SYNOPSIS
 
-my $threshold = 5;
-if (defined $opt_threshold) { $threshold = $opt_threshold; }
+logs-to-c [options]
 
-$opt_spam ||= 'spam.log';
-$opt_nonspam ||= 'ham.log';
-$opt_scoreset = 0 if ( !defined $opt_scoreset );
+ Options:
+    -c,--cffile=path	  Use path as the rules directory
+    -s,--scoreset=n	  Use scoreset n
+    -l,--logfile=file	  Read in file instead of masses.log
+    -o,--outputdir        Put output in the specified dir (default tmp/)
 
-my $nybias = 10;
+=head1 DESCRIPTION
 
-# lambda value for TCR equation, indicating the "cost" of recovering
-# from an FP.  The values are: 1 = tagged only, 9 = mailed back to
-# sender asking for token (TMDA style), 999 = deleted outright.
-# We (SpamAssassin) use a default of 5, representing "moved to
-# infrequently-read folder".
+B<logs-to-c> will read the mass-check log F<masses.log> or as
+specified by the B<--logfile> option, and convert it into the format
+needed by the perceptron. This is a format that is simple for the
+perceptron to parse, but is not very readable to humans.
 
-my $lambda = 5;
-if ($opt_lambda) { $lambda = $opt_lambda; }
+By default, output will be put in the directory ./tmp/ unless another
+directory is specified by the B<--outputdir> option. (Note: at the
+current time, this must be ./tmp/ in order for the perceptron to
+compile properly.)
 
-my %is_spam = ();
-my %tests_hit = ();
-my %mutable_tests = ();
+=head1 BUGS
 
-use vars qw(%rules %allrules);
+Please report bugs to http://bugzilla.spamassassin.org/
 
-readscores();
+=head1 SEE ALSO
 
-print "Reading per-message hit stat logs and scores...\n";
-my ($num_tests, $num_spam, $num_nonspam);
-my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore);
+L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>
 
-readlogs();
-read_ranges();
+=cut
 
-if ($justcount) {
-  $nybias = $nybias*($num_spam / $num_nonspam);
-  evaluate();
-} else {
-  print "Writing logs and current scores as C code...\n";
-  writescores_c();
-}
-exit 0;
+use FindBin;
+use lib "$FindBin::Bin/../lib";
+use Mail::SpamAssassin::Masses;
+use Getopt::Long qw(:config bundling auto_help);
+use Pod::Usage;
+use strict;
+use warnings;
 
+use vars qw{$opt_c $opt_l $opt_s $opt_o};
 
-sub readlogs {
-  my $count = 0;
-  $num_spam = $num_nonspam = 0;
+GetOptions("c|cffile=s@" => \$opt_c,
+	   "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
+	   "l|logfile=s" => \$opt_l,
+	   "o|outputdir|output=s" => \$opt_o); # --outputdir is the documented name; --output kept for compatibility
 
-  if ($justcount) {
-    $ga_yy = $ga_ny = $ga_yn = $ga_nn = 0;
-    $yyscore = $ynscore = $nyscore = $nnscore = 0.0;
-  }
 
-  foreach my $file ($opt_spam, $opt_nonspam) {
-    open (IN, "<$file");
+$opt_o ||= "./tmp/";
+if (!-d $opt_o) {
+  mkdir $opt_o, 0777 or die "Can't mkdir $opt_o";
+}
 
-    while (<IN>) {
-      next if /^\#/;
-      next if /^$/;
-      if($_ !~ /^.\s+([-\d]+)\s+\S+\s*/) { warn "bad line: $_"; next; }
-      my $hits = $1;
-#my $foo = $_;
-      $_ = $'; s/(?:bayes|time)=\S+//; s/,,+/,/g; s/^\s+//; s/\s+$//;
+$opt_l ||= "masses.log";
 
-      my $score = 0;
-      my @tests = ();
-      foreach my $tst (split (/,/, $_)) {
-	next if ($tst eq '');
-	if (!defined $scores{$tst}) {
-          #warn "unknown test in $file, ignored: $tst\n";
-	  next;
+if (!$opt_c || !scalar(@$opt_c)) {
+    # Try to read this in from the log, if possible
+    open IN, $opt_l or die "Can't open $opt_l: $!";
+    my $files = 0; # are we in the files section?
+    while(<IN>) {
+	if (!$files) {
+	    if (/^\# SVN revision:/) {
+		$opt_c = [ "$FindBin::Bin/../rules" ];
+		last;
+	    } elsif (/^\# Using configuration:$/) {
+		$files = 1;
+	    }
+	} elsif (/^\#\s+(.*)\s*$/) {
+	    push (@$opt_c, $1);
+	} else {
+	    # All done!
+	    last;
 	}
+    }
 
-	# Make sure to skip any subrules!
-	next if ( $allrules{$tst}->{issubrule} );
+    if (!defined $opt_c) {
+      $opt_c = [ "$FindBin::Bin/../rules" ];
+    }
 
-        if ($justcount) {
-          $score += $scores{$tst};
-        } else {
-          push (@tests, $tst);
-        }
-      }
-
-      if (!$justcount) { 
-        $tests_hit{$count} = \@tests;
-      }
-
-      if ($file eq $opt_spam) {
-	$num_spam++;
-        if ($justcount) {
-          if ($score >= $threshold) {
-            $ga_yy++; $yyscore += $score;
-          } else {
-            $ga_yn++; $ynscore += $score;
-          }
-        } else {
-          $is_spam{$count} = 1;
-        }
-      } else {
-	$num_nonspam++;
-        if ($justcount) {
-          if ($score >= $threshold) {
-#print "$score -- $foo";
-            $ga_ny++; $nyscore += $score;
-          } else {
-            $ga_nn++; $nnscore += $score;
-          }
-        } else {
-          $is_spam{$count} = 0;
-        }
-      }
-      $count++;
+    foreach my $file (@$opt_c) {
+	die "Can't read $file" unless -r $file;
     }
-    close IN;
-  }
-  $num_tests = $count;
 }
 
+# ignore rules that are subrules -- we don't generate scores for them...
 
-sub readscores {
-  if (!defined $argcffile) { $argcffile = "../rules"; }
-  print "Reading scores from \"$argcffile\"...\n";
-  system ("./parse-rules-for-masses -d \"$argcffile\" -s $opt_scoreset") and die;
-  require "./tmp/rules.pl";
-  %allrules = %rules;           # ensure it stays global
-}
+# Note: this will cause a difference over the old logs-to-c since rank
+# is dependent on the frequencies of all rules, not just non-subrules
 
+my $greprules = sub { return 0 if $_[1]->{issubrule}; return 1; };
+
+$opt_s ||= 0; # |
+
+my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
+					       scoreset => $opt_s, # ,,
+					       logfile => $opt_l,
+                                               greprules => $greprules });
+
+$masses->readlogs();
+$masses->do_score_ranges();
+
+my $rules = $masses->get_rules_array();
+my $logs = $masses->get_logs();
+
+my @index_to_rule;
+my $num_spam = $masses->get_num_spam();
+my $num_ham = $masses->get_num_ham();
+
+# This is misleading -- num_tests is really num_msgs
+my $num_tests = $num_spam + $num_ham;
+
+
+# Write logs and scores as C code
+writescores_c();
+writetests_c();
+
+
 sub writescores_c {
-  my $output = '';
-  my $size = 0;
+
   my $mutable = 0;
-  my $i;
+  my $output = '';
+  my $count = 0;
+  my $score = 0;
 
-    # jm: now, score-ranges-from-freqs has tflags to work from, so
-    # it will always list all mutable tests.
+  foreach my $rule (sort {($b->{ismutable} <=> $a->{ismutable}) ||
+			  ($a->{name} cmp $b->{name}) } @$rules) {
 
-  @index_to_rule = sort {($ignored_rule{$a} <=> $ignored_rule{$b}) ||
-			  ($mutable_tests{$b} <=> $mutable_tests{$a}) ||
-			   ($a cmp $b)} (keys %scores);
-  my $max_hits_per_msg = 0;
-  for ($file = 0; $file < $num_tests; $file++) {
-    my(@hits) =
-     grep {(! $ignored_rule{$_}) && $mutable_tests{$_}} (@{$tests_hit{$file}});
-    if ((scalar(@hits)+1) > $max_hits_per_msg) {
-      $max_hits_per_msg = scalar(@hits)+1;
-    }
-  }
+    $score = $rule->{score};
 
-  for ($i = 0; $i <= $#index_to_rule; $i++) {
-    my $name = $index_to_rule[$i];
-    $rule_to_index{$name} = $i;
+    # ignored rules (i.e. no scores)
+    next unless $score;
 
-    if ($ignored_rule{$name}) { next; }
+    # also ignore rules with score range 0
+    next if (!$rule->{range_lo} && !$rule->{range_hi});
 
-    if ($mutable_tests{$name} == 0) {
-      $range_lo{$name} = $range_hi{$name} = $scores{$name};
-    } else {
+    # Set an index
+    $rule->{index} = $count;
+    $index_to_rule[$count] = $rule; # add the reference to the array
+
+    if ($rule->{ismutable}) {
       $mutable++;
-      if ($range_lo{$name} > $range_hi{$name}) {
-	($range_lo{$name},$range_hi{$name}) =
-	 ($range_hi{$name},$range_lo{$name});
+      if ($score > $rule->{range_hi}) {
+	$score = $rule->{range_hi} - 0.001;
+      } elsif ($score < $rule->{range_lo}) {
+	$score = $rule->{range_lo} + 0.001;
       }
-      #$range_lo{$name} ||= 0.1;
-      #$range_hi{$name} ||= 1.5;
     }
+    # These should all be set properly if not mutable
+    # score = range_lo = range_hi
+    else {
+      warn "hi != lo for " . $rule->{name} . "!" if $rule->{range_lo} != $rule->{range_hi};
+      $score = $rule->{range_hi} = $rule->{range_lo};
+    }
 
-    $output .= ".".$i."\n".
-                "n".$name."\n".
-                "b".$scores{$name}."\n".
-                "m".$mutable_tests{$name}."\n".
-                "l".$range_lo{$name}."\n".
-                "h".$range_hi{$name}."\n";
-    $size++;
+    $output .= "." . $count . "\n" .
+         "n" . $rule->{name} . "\n" .
+	 "b" . $score . "\n" .
+	 "m" . $rule->{ismutable} . "\n" .
+	 "l" . $rule->{range_lo} . "\n" .
+	 "h" . $rule->{range_hi} . "\n";
+
+    $count++;
+
   }
 
+  # Output this
 
-  open (DAT, ">tmp/scores.data");
-  print DAT "N$size\n", "M$mutable\n", # informational only
-   $output;
+  open (DAT, ">$opt_o/scores.data") or die "Can't write $opt_o/scores.data: $!"; # fail loudly instead of printing to a closed handle
+  print DAT "N$count\n", "M$mutable\n"; # informational
+  print DAT $output;
   close DAT;
 
-  open (OUT, ">tmp/scores.h");
-  print OUT "
+  open (OUT, ">$opt_o/scores.h") or die "Can't write $opt_o/scores.h: $!"; # fail loudly instead of printing to a closed handle
+  print OUT <<EOF;
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
-
-int num_scores = $size;
+ 
+int num_scores = $count;
 int num_mutable = $mutable;
-unsigned char is_mutable[$size];
-double range_lo[$size];
-double range_hi[$size];
-double bestscores[$size];
-char *score_names[$size];
-double tmp_scores[$size][2];
+unsigned char is_mutable[$count];
+double range_lo[$count];
+double range_hi[$count];
+double bestscores[$count];
+char *score_names[$count];
+double tmp_scores[$count][2];
 unsigned char ny_hit[$mutable];
 unsigned char yn_hit[$mutable];
-
+ 
 double lookup[$mutable];
-
+ 
 /* readscores() is defined in tests.h */
+EOF
 
-";
   close OUT;
 
-  writetests_c($max_hits_per_msg); # make sure $rule_to_index is around
 }
 
+
 sub writetests_c {
-  my $max_hits_per_msg = $_[0];
 
-  my(%uniq_files) = ();
-  my(%count_keys) = ();
-  my(%file_key) = ();
+  my $max_hits_per_msg = 0;
+  my @goodtests;
+  my %uniq_logs;
+  my $uniq_key;
 
-  my $file;
+  my $i = 0;
 
-  for ($file = 0; $file < $num_tests; $file++)
-  {
-    my $uniq_key = $is_spam{$file} . " ";
+  # This will "compress" the logs so that one log entry can have a
+  # "count" of n indicating it represents n similar messages
 
-    my(@good_tests) =
-     grep {length($_) && (! $ignored_rule{$_}) &&
-	    (defined($rule_to_index{$_}))} (@{ $tests_hit{$file} });
+  foreach my $log (@$logs) {
 
-    @good_tests = sort {$a <=> $b} (map {$rule_to_index{$_}} (@good_tests));
+    (@goodtests) = grep {exists($_->{index})} (@{$log->{tests_hit}});
+    @goodtests = sort {$a <=> $b} map {$_->{index}} @goodtests;
 
-    $uniq_key .= join(" ",@good_tests);
+    if($max_hits_per_msg < scalar(@goodtests) + 1) { # keep one slot of headroom so the ">=" overflow check below cannot fire on the busiest message
+      $max_hits_per_msg = scalar(@goodtests) + 1;
+    }
 
-    if (exists($count_keys{$uniq_key})) {
-      $count_keys{$uniq_key}++;
+    $uniq_key = $log->{isspam} ? "s" : "";
+    $uniq_key .= join(" ", @goodtests);
+
+
+    # The %uniq_logs hash's entries will be the log info for each unique log
+    # $log->{count} is increased to indicate similar logs
+
+    if (exists($uniq_logs{$uniq_key})) {
+      $uniq_logs{$uniq_key}->{count}++;
     } else {
-      $count_keys{$uniq_key} = 1;
-      $file_key{$file} = $uniq_key;
-      $uniq_files{$file} = scalar(keys(%count_keys)) - 1;
+      $uniq_logs{$uniq_key} = $log;
+      $uniq_logs{$uniq_key}->{count} = 1;
     }
+
   }
 
-  my $num_nondup = scalar(keys(%uniq_files));
+  my $num_nondup = scalar(keys %uniq_logs);
 
-  open (TOP, ">tmp/tests.h");
-  print TOP "
+  open (TOP, ">$opt_o/tests.h") or die "Can't write $opt_o/tests.h: $!"; # fail loudly instead of printing to a closed handle
+  print TOP <<EOF;
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
@@ -274,7 +267,7 @@
 int num_tests = $num_tests;
 int num_nondup = $num_nondup;
 int num_spam = $num_spam;
-int num_nonspam = $num_nonspam;
+int num_nonspam = $num_ham;
 int max_hits_per_msg = $max_hits_per_msg;
 unsigned char num_tests_hit[$num_nondup];
 unsigned char is_spam[$num_nondup];
@@ -282,196 +275,77 @@
 double scores[$num_nondup];
 double tmp_total[$num_nondup];
 int tests_count[$num_nondup];
+EOF
 
-";
-  $_ = join ('', <DATA>);
-  print TOP $_;
+
+  print TOP join('', <DATA>);
   close TOP;
 
-  open (DAT, ">tmp/tests.data");
 
-  foreach $file (sort {$a <=> $b} (keys %uniq_files)) {
-    print DAT ".".$uniq_files{$file}."\n";
+  open (DAT, ">$opt_o/tests.data") or die "Can't write $opt_o/tests.data: $!"; # fail loudly instead of printing to a closed handle
 
-    my $out = '';
-    $out .= "s".$is_spam{$file}."\n";
+  my $out;
+  my $base_score;
+  my $num_tests_hit;
 
-    my $base_score = 0;
-    my $num_tests_hit = 0;
-    foreach my $test (@{$tests_hit{$file}}) {
-      if ($test eq '') { next; }
+  $i = 0;
+  foreach my $log (values %uniq_logs) {
+    $out = '';
+    $base_score = $num_tests_hit = 0;
 
-      if ($ignored_rule{$test}) {
-        warn "ignored rule $test got a hit in $file!\n";
-        next;
+    print DAT "." . $i . "\n";
+
+    $out .= "s" . ( ($log->{isspam})? 1 : 0 ) . "\n";
+
+    foreach my $test (@{$log->{tests_hit}}) {
+      if (!$test->{score}) {
+	# Don't really know why this happens, but the old logs-to-c
+	#did it too
+
+	warn "ignored rule " . $test->{name} . " got a hit!";
+	next;
       }
 
-      if (!defined $rule_to_index{$test}) {
-	warn "test with no C index: $test\n";
+      if (!$test->{range_lo} && !$test->{range_hi}) {
+	# We ignored this rule
 	next;
       }
 
-      if ($mutable_tests{$test}) {
-      $num_tests_hit++;
-      $out .= "t".$rule_to_index{$test}."\n";
-
-      if ($num_tests_hit >= $max_hits_per_msg) {
-	die "Need to increase \$max_hits_per_msg";
+      # debugging...
+      if (!defined $test->{index}) {
+	warn "test with no index";
       }
-      } else {
-	$base_score += $scores{$test};
-      }
-    }
 
-    $out .= "b" . $base_score . "\n"; # score to add in for non-mutable tests
-    $out .= "c" . $count_keys{$file_key{$file}} . "\n";
+      if ($test->{ismutable}) {
+	$num_tests_hit++;
+	$out .= "t".$test->{index}."\n";
 
-    print DAT "n".$num_tests_hit."\n".$out;
-  }
-  close DAT;
-}
+	if ($num_tests_hit >= $max_hits_per_msg) {
+	  die "\$max_hits_per_msg not big enough!";
+	}
 
-sub read_ranges {
-  if (!-f 'tmp/ranges.data') {
-    system ("make tmp/ranges.data");
-  }
+      }
+      else {
+	$base_score += $test->{score};
+      }
 
-  # read ranges, and mutableness, from ranges.data.
-  open (IN, "<tmp/ranges.data")
-  	or die "need to run score-ranges-from-freqs first!";
-
-  my $count = 0;
-  while (<IN>) {
-    /^(\S+) (\S+) (\d+) (\S+)$/ or next;
-    my $t = $4;
-    $range_lo{$t} = $1+0;
-    $range_hi{$t} = $2+0;
-    my $mut = $3+0;
-
-    if ($allrules{$t}->{issubrule}) {
-      $ignored_rule{$t} = 1;
-      $mutable_tests{$t} = 0;
-      next;
     }
-    if (($range_lo{$t} == $range_hi{$t}) && (! $range_lo{$t})) {
-      #warn "ignored rule: score and range == 0: $t\n";
-      $ignored_rule{$t} = 1;
-      $mutable_tests{$t} = 0;
-      next;
-    }
 
-    $ignored_rule{$t} = 0;
-    $index_to_rule[$count] = $t;
-    $count++;
+    $out .= "b" . $base_score . "\n"; # score to add for non-mutable tests
+    $out .= "c" . $log->{count} . "\n"; # number of identical logs
 
-    if (!$mut) {
-      $mutable_tests{$t} = 0;
-    } elsif ($range_lo{$t} == $range_hi{$t}) {
-      $mutable_tests{$t} = 0;
-    } elsif ($allrules{$t}->{tflags} =~ m/\buserconf\b/i) {
-      $mutable_tests{$t} = 0;
-    } else {
-      $mutable_tests{$t} = 1;
-    }
-    unless ($mutable_tests{$t} || $scores{$t}) {
-      $ignored_rule{$t} = 1;
-    }
-  }
-  close IN;
+    print DAT "n" . $num_tests_hit . "\n" . $out;
 
-  # catch up on the ones missed; seems to be userconf or 0-hitters mostly.
-  foreach my $t (sort keys %allrules) {
-    next if (exists($range_lo{$t}));
-    if ($allrules{$t}->{issubrule}) {
-      $ignored_rule{$t} = 1;
-      $mutable_tests{$t} = 0;
-      next;
-    }
-    $ignored_rule{$t} = 0;
-    unless (exists($mutable_tests{$t}) &&
-	    ($allrules{$t}->{tflags} !~ m/\buserconf\b/i)) {
-      $mutable_tests{$t} = 0;
-    }
-    unless ($mutable_tests{$t} || $scores{$t}) {
-      $ignored_rule{$t} = 1;
-    }
-    $index_to_rule[$count] = $t;
-    $count++;
+    $i++;
   }
-  foreach my $t (keys %range_lo) {
-    next if ($ignored_rule{$t});
-    if ($mutable_tests{$t}) {
-      if (($scores{$t} == 1) && ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
-	$scores{$t} = -1;
-      } elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/) &&
-	       ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
-	$scores{$t} = -0.01;
-      }
-      if ($scores{$t} >= $range_hi{$t}) {
-	$scores{$t} = $range_hi{$t} - 0.001;
-      } elsif ($scores{$t} <= $range_lo{$t}) {
-	$scores{$t} = $range_lo{$t} + 0.001;
-      }
-    } else {
-      if ($allrules{$t}->{tflags} =~ m/\buserconf\b/i) {
-	next;
-      } elsif ($range_lo{$t} == $range_hi{$t}) {
-	$scores{$t} = $range_lo{$t};
-	next;
-      }
-      if (($scores{$t} == 1) && ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
-	$scores{$t} = -1;
-      } elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/) &&
-	       ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
-	$scores{$t} = -0.01;
-      }
-      if ($scores{$t} > $range_hi{$t}) {
-	$scores{$t} = $range_hi{$t};
-      } elsif ($scores{$t} < $range_lo{$t}) {
-	$scores{$t} = $range_lo{$t};
-      }
-    }
-  }
-}
 
-sub evaluate {
-   my $fprate = ($ga_ny / $num_tests) * 100.0;
-   my $fnrate = ($ga_yn / $num_tests) * 100.0;
+  close DAT;
 
-   printf ("\n# SUMMARY for threshold %3.1f:\n", $threshold);
-   printf "# Correctly non-spam: %6d  %4.2f%%  (%4.2f%% of non-spam corpus)\n", $ga_nn,
-       ($ga_nn /  $num_tests) * 100.0, ($ga_nn /  $num_nonspam) * 100.0;
-   printf "# Correctly spam:     %6d  %4.2f%%  (%4.2f%% of spam corpus)\n" , $ga_yy,
-       ($ga_yy /  $num_tests) * 100.0, ($ga_yy /  $num_spam) * 100.0;
-   printf "# False positives:    %6d  %4.2f%%  (%4.2f%% of nonspam, %6.0f weighted)\n", $ga_ny,
-       $fprate, ($ga_ny /  $num_nonspam) * 100.0, $nyscore*$nybias;
-   printf "# False negatives:    %6d  %4.2f%%  (%4.2f%% of spam, %6.0f weighted)\n", $ga_yn,
-       $fnrate, ($ga_yn /  $num_spam) * 100.0, $ynscore;
 
-  # convert to the TCR metrics used in the published lit
-  my $nspamspam = $ga_yy;
-  my $nspamlegit = $ga_yn;
-  my $nlegitspam = $ga_ny;
-  my $nlegitlegit = $ga_yn;
-  my $nlegit = $num_nonspam;
-  my $nspam = $num_spam;
-
-  my $werr = ($lambda * $nlegitspam + $nspamlegit)
-                  / ($lambda * $nlegit + $nspam);
-
-  my $werr_base = $nspam
-                  / ($lambda * $nlegit + $nspam);
-
-  $werr ||= 0.000001;     # avoid / by 0
-  my $tcr = $werr_base / $werr;
-
-  my $sr = ($nspamspam / $nspam) * 100.0;
-  my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;
-  printf "# TCR: %3.6f  SpamRecall: %3.3f%%  SpamPrec: %3.3f%%  FP: %3.2f%%  FN: %3.2f%%\n", $tcr, $sr, $sp, $fprate, $fnrate;
 }
 
-__DATA__
 
+__DATA__
 void loadtests (void) {
   FILE *fin = fopen ("tmp/tests.data", "r");
   char buf[256];
@@ -557,4 +431,3 @@
 
   printf ("Read scores for %d tests.\n", num_scores);
 }
-
Index: masses/post-ga-analysis.pl
===================================================================
--- masses/post-ga-analysis.pl	(revision 20231)
+++ masses/post-ga-analysis.pl	(working copy)
@@ -7,9 +7,8 @@
 my %scores;
 my %rulehit;
 
-open(SPAM, "<spam.log");
-open(NONSPAM, "<nonspam.log");
-open(SCORES, "<newscores");
+open(LOGS, "<masses.log");
+open(SCORES, "<perceptron.scores");
 
 while(<SCORES>)
 {
@@ -22,11 +21,12 @@
 
 close(SCORES);
 
-while(<SPAM>)
+while(<LOGS>)
 {
     next if /^#/;
-    /.\s+[-0-9]*\s+[^\s]+\s+([^\s]*)(\s+?:(?:bayes|time)=\S+)\s*?$/;
-    my @rules=split /,/,$1;
+    /(.)\s+.\s+[-0-9]*\s+[^\s]+\s+([^\s]*)(\s+?:(?:bayes|time)=\S+)\s*?$/;
+    my $class = $1;
+    my @rules=split /,/,$2;
     my $score = 0.0;
     foreach $rule (@rules)
     {
@@ -35,7 +35,7 @@
 	$rulehit{$rule}++;
     }
 
-    if($score < 5)
+    if($class eq "s" && $score < 5)
     {
 	foreach $rule (@rules)
 	{
@@ -44,27 +44,8 @@
 	}
 	$nfn++;
     }
-}
-
-close(SPAM);
-
-while(<NONSPAM>)
-{
-    next if /^#/;
-    /.\s+[-0-9]*\s+[^\s]+\s+([^\s]*)\s*$/;
-    next unless defined($1);
-
-    my @rules=split /,/,$1;
-    my $score = 0.0;
-    foreach $rule (@rules)
+    if($class eq "h" && $score >= 5)  # ham at or above the threshold is a false positive
     {
-        next unless (defined ($scores{$rule}));
-	$score += $scores{$rule};
-	$rulehit{$rule}++;
-    }
-
-    if($score >= 5)
-    {
 	foreach $rule (@rules)
 	{
             next unless (defined ($scores{$rule}));
@@ -72,8 +53,11 @@
 	}
 	$nfp++;
     }
+
 }
 
+close(LOGS);
+
 @fpk = sort { $falsepos{$b}/($rulehit{$b}||0.0001) <=> $falsepos{$a}/($rulehit{$a}||0.00001) } keys %falsepos;
 
 print "COMMON FALSE POSITIVES: ($nfp total)\n-----------------------\n\n";
Index: masses/convert-old-logs-to-new
===================================================================
--- masses/convert-old-logs-to-new	(revision 0)
+++ masses/convert-old-logs-to-new	(revision 0)
@@ -0,0 +1,15 @@
+#!/bin/sh -e
+
+cat spam.log | perl -ne's/^Y/s s/; s/^\./s h/; print unless /^\#/;' \
+  > spam.log.sorted
+
+cat ham.log | perl -ne's/^Y/h s/; s/^\./h h/; print unless /^\#/;' \
+  > ham.log.sorted
+
+# sort by time
+
+echo \# SVN revision: > masses.log
+
+sort --field-separator='=' -n -k2,2 --merge spam.log.sorted ham.log.sorted \
+  >> masses.log
+

Property changes on: masses/convert-old-logs-to-new
___________________________________________________________________
Name: svn:executable
   + *

Index: masses/score-ranges-from-freqs
===================================================================
--- masses/score-ranges-from-freqs	(revision 20231)
+++ masses/score-ranges-from-freqs	(working copy)
@@ -1,251 +0,0 @@
-#!/usr/bin/perl -w
-#
-# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# </@LICENSE>
-
-# (rough) graphic demo of this algorithm:
-# 0.0  = -limit [......] 0 ........ limit
-# 0.25 = -limit ..[..... 0 .]...... limit
-# 0.5  = -limit ....[... 0 ...].... limit
-# 0.75 = -limit ......[. 0 .....].. limit
-# 1.0  = -limit ........ 0 [......] limit
-my $sliding_window_limits = 4.8; # limits = [-$range, +$range]
-my $sliding_window_size =   5.5; # scores have this range within limits
-
-# 0.0  = -limit [......] 0 ........ limit
-# 0.25 = -limit ....[... 0 ]....... limit
-# 0.5  = -limit ......[. 0 .]...... limit (note: tighter)
-# 0.75 = -limit .......[ 0 ...].... limit
-# 1.0  = -limit ........ 0 [......] limit
-my $shrinking_window_lower_base =   0.00; 
-my $shrinking_window_lower_range =  1.00; # *ratio, added to above
-my $shrinking_window_size_base =    1.00;
-my $shrinking_window_size_range =   1.00; # *ratio, added to above
-
-my $use_sliding_window = 0;
-
-my $argcffile = shift @ARGV;
-my $scoreset = shift @ARGV;
-$scoreset = 0 if ( !defined $scoreset );
-
-if (defined ($argcffile) && $argcffile eq '-test') {
-  # use this to debug the ranking -> score-range mapping:
-  for $rat (0.0, 0.25, 0.5, 0.75, 1.0) {
-    my ($lo, $hi); if ($use_sliding_window) {
-      ($lo, $hi) = sliding_window_ratio_to_range($rat);
-    } else {
-      ($lo, $hi) = shrinking_window_ratio_to_range($rat);
-    }
-    warn "test: $rat => [ $lo $hi ]\n";
-  } exit;
-}
-
-my %freq_spam = ();
-my %freq_nonspam = ();
-
-my $num_spam;
-my $num_nonspam;
-my $num_total;
-
-my %mutable_tests = ();
-my %ranking = ();
-my %soratio = ();
-my %is_nice = ();
-
-if (!defined $argcffile) { $argcffile = "../rules"; }
-system ("./parse-rules-for-masses -d \"$argcffile\" -s $scoreset") and die;
-if (-e "tmp/rules.pl") {
-  # Note, the spaces need to stay in front of the require to work around a RPM 4.1 problem
-  require "./tmp/rules.pl";
-}
-else {
-  die "parse-rules-for-masses had no error but no tmp/rules.pl!?!";
-}
-
-while (<>) {
-  /^\s*([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+\S+\s+(.+)\s*$/ or next;
-
-  my $overall = $1+0;
-  my $spam = $2+0;
-  my $nonspam = $3+0;
-  my $soratio = $4+0;
-  my $ranking = $5+0;
-  my $test = $6;
-
-  if ($test eq '(all messages)') {
-    $num_spam = $spam;
-    $num_nonspam = $nonspam;
-    $num_total = $spam+$nonspam;
-    next;
-  }
-  next if ($test eq '(all messages as %)');
-
-  if (!defined ($rules{$test})) {
-    warn "rule $test no longer exists; ignoring\n";
-    next;
-  }
-
-  $freq{$test} = $overall;
-  $freq_spam{$test} = $spam;
-  $freq_nonspam{$test} = $nonspam;
-
-  my $tflags = $rules{$test}->{tflags}; $tflags ||= '';
-  if ($tflags =~ /\buserconf\b/ ||
-      ( ($scoreset % 2) == 0 && $tflags =~ /\bnet\b/ )) {
-    $mutable_tests{$test} = 0;
-  } else {
-    $mutable_tests{$test} = 1;
-  }
-  if ($tflags =~ m/\bnice\b/i) {
-    $is_nice{$test} = 1;
-  } else {
-    $is_nice{$test} = 0;
-  }
-
-  if ($overall < 0.01) {        # less than 0.01% of messages were hit
-    $mutable_tests{$test} = 0;
-    $soratio{$test} = 0.5;
-    $ranking{$test} = 0.0;
-    $rules{$test}->{score} = 0; # tvd - disable these rules automagically
-
-  } else {
-    $soratio{$test} = $soratio;
-    $ranking{$test} = $ranking;
-  }
-}
-
-if ( ! mkdir "tmp", 0755 ) {
-  warn "Couldn't create tmp directory!: $!\n";
-}
-
-open (OUT, ">tmp/ranges.data");
-foreach my $test (sort { $ranking{$b} <=> $ranking{$a} } keys %freq) {
-  if (!defined ($rules{$test})) {
-    warn "no rule $test";
-    print OUT ("0 0 0 $test\n");
-    next;
-  }
-
-  my $overall = $freq{$test};
-  my $spam = $freq_spam{$test};
-  my $nonspam = $freq_nonspam{$test};
-  my $soratio = $soratio{$test};
-  my $ranking = $ranking{$test};
-  my $mutable = $mutable_tests{$test};
-
-  if (!$mutable || $rules{$test}->{score} == 0) { # didn't look for score 0 - tvd
-    printf OUT ("%3.3f %3.3f 0 $test\n",
-                         $rules{$test}->{score},
-                         $rules{$test}->{score});
-    next;
-  }
-
-  # 0.0 = best nice, 1.0 = best nonnice
-  if ($is_nice{$test}) {
-    $ranking = .5 - ($ranking / 2);
-  } else {
-    $ranking = .5 + ($ranking / 2);
-  }
-
-  my ($lo, $hi);
-  if ($use_sliding_window) {
-    ($lo, $hi) = sliding_window_ratio_to_range($ranking);
-  } else {
-    ($lo, $hi) = shrinking_window_ratio_to_range($ranking);
-  }
-
-  # tvd
-  my $tflags = $rules{$test}->{tflags}; $tflags ||= '';
-  if ( $is_nice{$test} && ( $ranking < .5 ) ) { # proper nice rule
-    if ( $tflags =~ /\blearn\b/ ) { # learn rules should get a higher score # -5.4
-      $lo *=1.8;
-    }
-    elsif ($soratio <= 0.05 && $nonspam > 0.5) { # let good rules be larger if they want to, -4.5
-      $lo *= 1.5;
-    }
-
-    $hi =	($soratio == 0) ? $lo :
-    		($soratio <= 0.005 ) ? $lo/1.1 :
-    		($soratio <= 0.010 && $nonspam > 0.2) ? $lo/2.0 :
-		($soratio <= 0.025 && $nonspam > 1.5) ? $lo/10.0 :
-		0;
-
-    if ( $soratio >= 0.35 ) { # auto-disable bad rules
-      ($lo,$hi) = (0,0);
-    }
-  }
-  elsif ( !$is_nice{$test} && ( $ranking >= .5 ) ) { # proper spam rule
-    if ( $tflags =~ /\blearn\b/ ) { # learn rules should get a higher score
-      $hi *=1.8;
-    }
-    elsif ( $soratio >= 0.99 && $spam > 1.0 ) {
-      $hi *= 1.5; # let good rules be larger if they want to
-    }
-
-    $lo =	($soratio == 1) ? $hi:
-    		($soratio >= 0.995 ) ? $hi/4.0 :
-    		($soratio >= 0.990 && $spam > 1.0) ? $hi/8.0 :
-		($soratio >= 0.900 && $spam > 10.0) ? $hi/24.0 :
-		0;
-
-    if ( $soratio <= 0.65 ) { # auto-disable bad rules
-      ($lo,$hi) = (0,0);
-    }
-  }
-  else { # rule that has bad nice setting
-    ($lo,$hi) = (0,0);
-  }
-  $mutable = 0 if ( $hi == $lo );
-
-  printf OUT ("%3.1f %3.1f $mutable $test\n", $lo, $hi);
-}
-close OUT;
-exit;
-
-sub sliding_window_ratio_to_range {
-  my $ratio = shift;
-  my $lo = -$sliding_window_limits + ($sliding_window_size * $ratio);
-  my $hi = +$sliding_window_limits - ($sliding_window_size * (1-$ratio));
-  if ($lo > $hi) { # ???
-    ($lo,$hi) = ($hi,$lo);
-  }
-  ($lo, $hi);
-}
-
-sub shrinking_window_ratio_to_range {
-  my $ratio = shift;
-  my $is_nice = 0;
-  my $adjusted = ($ratio -.5) * 2;      # adj [0,1] to [-1,1]
-  if ($adjusted < 0) { $is_nice = 1; $adjusted = -$adjusted; }
-
-#$adjusted /= 1.5 if ( $ratio < 0.95 && $ratio > 0.15 ); # tvd
-
-  my $lower = $shrinking_window_lower_base 
-                        + ($shrinking_window_lower_range * $adjusted);
-  my $range = $shrinking_window_size_base 
-                        + ($shrinking_window_size_range * $adjusted);
-  my $lo = $lower;
-  my $hi = $lower + $range;
-  if ($is_nice) {
-    my $tmp = $hi; $hi = -$lo; $lo = -$tmp;
-  }
-  if ($lo > $hi) { # ???
-    ($lo,$hi) = ($hi,$lo);
-  }
-
-  ($lo, $hi);
-}
-
Index: masses/find-extremes
===================================================================
--- masses/find-extremes	(revision 20231)
+++ masses/find-extremes	(working copy)
@@ -17,38 +17,144 @@
 # limitations under the License.
 # </@LICENSE>
 
-use Getopt::Std;
-getopts("l:L:h");
 
+use FindBin;
+use lib "$FindBin::Bin/../lib";
+use Mail::SpamAssassin::Masses;
+use Getopt::Long qw(:config bundling auto_help);
+use Pod::Usage;
+use strict;
+use warnings;
+
 use vars qw {
-  $opt_h $opt_l $opt_L
+$opt_c $opt_s $opt_l $opt_L $opt_inclang
 };
 
-sub usage {
-  die "find-extremes [-l LC] [-L LC] [spam log] [nonspam log]
+GetOptions("c|cffile=s@" => \$opt_c,
+           "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
+           "l|logfile=s" => \$opt_l,
+           "L|language=s" => \$opt_L,
+           "include-language=s" => \$opt_inclang);
 
-    -l LC  also print language specific rules for lang code LC (or 'all')
-    -L LC  only print language specific rules for lang code LC (or 'all')
 
-    options -l and -L are mutually exclusive.
 
-    if either the spam or and nonspam logs are unspecified, the defaults
-    are \"spam.log\" and \"nonspam.log\" in the cwd.
+my $lower = 1;
+#$threshold = 5;
+my $higher = 9;
+my $min_expected = 2; # Should not be set to more than 5 or less than 2
 
-";
+
+=head1 NAME
+ 
+find-extremes - Determine which rules are most likely to cause false positives/negatives.
+ 
+=head1 SYNOPSIS
+ 
+find-extremes [options]
+ 
+ Options:
+    -c,--cffile=path      Use path as the rules directory
+    -s,--scoreset=n       Use scoreset n
+    -l,--logfile=file     Read in file instead of masses.log
+    -L,--language=lc      Only print language specific tests for specified lang code (try 'all')
+    --include-language=lc Also print language specific tests for specified lang code (try 'all')
+ 
+=head1 DESCRIPTION
+
+B<find-extremes> will read the mass-check log F<masses.log> or the
+log given by the B<--logfile> option. By default, B<find-extremes>
+will assume the proper values for B<--cffile> based on the header of
+the masses.log. The output will include the following columns:
+
+=over 4
+
+=item RULE
+
+=item CHISQUARE
+
+=item RATIO_FALSEPOS
+
+=item OVER_FALSEPOS
+
+=item FREQ_OVER
+
+=back
+
+=head1 BUGS
+
+This script may or may not work as designed - it probably needs some
+tweaking, and I probably introduced a bug into it while re-writing for
+the new Masses stuff. 
+
+=head1 NOTES
+
+This script is poorly documented. Patches welcome.
+
+=cut
+
+
+$opt_s = 0 unless defined $opt_s;
+
+my $ok_lang = lc ( $opt_inclang || $opt_L || '');
+$ok_lang = '.' if ($ok_lang eq 'all');
+
+my $greprules = sub {
+  my ($name, $rule) = @_;
+
+  return 0 if (($opt_L && !$rule->{lang}) ||
+           ($rule->{lang} &&
+            (!$ok_lang || $rule->{lang} !~ /^$ok_lang/i))); # Wrong language
+
+  return 0 if ($rule->{tflags} =~ /\bnet\b/);
+
+  return 1;
+
+};
+
+$opt_l ||= "masses.log";
+
+if (!$opt_c || !scalar(@$opt_c)) {
+    # Try to read this in from the log, if possible
+    open (IN, $opt_l) or die "Can't open $opt_l: $!";
+    my $files = 0; # are we in the files section?
+    while(<IN>) {
+        if (!$files) {
+            if (/^\# SVN revision:/) {
+                $opt_c = [ "$FindBin::Bin/../rules" ];
+                last;
+            } elsif (/^\# Using configuration:$/) {
+                $files = 1;
+            }
+        } elsif (/^\#\s+(.*)\s*$/) {
+            push (@$opt_c, $1);
+        } else {
+            # All done!
+            last;
+        }
+    }
+
+    foreach my $file (@$opt_c) {
+        die "Can't read $file" unless -r $file;
+    }
 }
 
-usage() if($opt_h || ($opt_l && $opt_L));
+my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
+                                               scoreset => $opt_s,
+                                               greprules => $greprules,
+                                               logfile => $opt_l,
+                                               nologs => 1});
 
-$lower = 1;
-#$threshold = 5;
-$higher = 9;
-$min_expected = 2; # Should not be set to more than 5 or less than 2
+$masses->readrules();
+$masses->readlogs();
 
-my %freq_spam = ();	# how often non-nice found in spam
+my $rules = $masses->get_rules_hash();
+my $logs = $masses->get_logs();
+
+my $num_spam = $masses->get_num_spam();
+my $num_ham = $masses->get_num_ham();
+
 my %freq_over_higher_falsepos = (); # how often non-nice found in ones over
                                     # higher threshold that are false positives
-my %freq_nonspam = ();	# how often nice found in nonspam
 my %freq_under_lower_falseneg = (); # how often nice found in ones under
                                     # lower threshold that are false negatives
 
@@ -59,43 +165,54 @@
 my %ratio_expected_falsepos = (); # ratio version of above
 my %ratio_expected_falseneg = (); # ditto
 
-my $num_spam = 0;
-my $num_nonspam = 0;
 my $num_over_higher_falsepos = 0;
 my $num_under_lower_falseneg = 0;
-my $ok_lang = '';
 
-readscores();
+my %chisquare = ( );
+my %prob = ( );
 
-$ok_lang = lc ($opt_l || $opt_L || '');
-if ($ok_lang eq 'all') { $ok_lang = '.'; }
 
-foreach my $key (keys %rules) {
+foreach my $key (keys %$rules) {
 
-  if ( ($opt_L && !$rules{$key}->{lang}) ||
-       ($rules{$key}->{lang} &&
-         (!$ok_lang || $rules{$key}->{lang} !~ /^$ok_lang/i)
-     ) ) {
-    delete $rules{$key} ; next;
-  }
-
-  if ($rules{$key}->{tflags} =~ m/net/) {
-    delete $rules{$key};
-    next;
-  }
-  if ($rules{$key}->{tflags} !~ m/userconf/) {
-    if ($rules{$key}->{tflags} =~ m/nice/) {
-      $freq_nonspam{$key} = 0;
+  if ($rules->{$key}->{tflags} !~ /\buserconf\b/) {
+    if ($rules->{$key}->{tflags} =~ m/nice/) {
       $freq_under_lower_falseneg{$key} = 0;
     } else {
-      $freq_spam{$key} = 0;
       $freq_over_higher_falsepos{$key} = 0;
     }
   }
+
 }
 
-readlogs();
+foreach my $log (@$logs) {
 
+  if($log->{isspam}) {
+    # Also need to count plus_hits
+    my $plus_hits = 0;
+    foreach my $test (@{$log->{tests_hit}}) {
+      $plus_hits += $test->{score} if ($test->{score} > 0);
+    }
+    # Spam at or under $lower despite enough positive hits: false negative
+    if(($log->{score} <= $lower) && $plus_hits && $plus_hits >= $lower) {
+      $num_under_lower_falseneg++; # once per message, not once per rule hit
+      foreach my $test (@{$log->{tests_hit}}) {
+        my $name = $test->{name};
+        $freq_under_lower_falseneg{$name}++ if exists $freq_under_lower_falseneg{$name};
+      }
+    }
+  }
+  else {
+    if($log->{score} > $higher) {
+      $num_over_higher_falsepos++; # once per message, not once per rule hit
+      foreach my $test (@{$log->{tests_hit}}) {
+        my $name = $test->{name};
+        $freq_over_higher_falsepos{$name}++ if exists $freq_over_higher_falsepos{$name};
+      }
+    }
+  }
+
+}
+
 unless (($num_over_higher_falsepos >= $min_expected)
 	&& ($num_under_lower_falseneg >= $min_expected)) {
   die "Insufficient extremes in dataset (" . $num_over_higher_falsepos .
@@ -119,12 +236,13 @@
 }
 
 my $ratio_falsepos = $num_over_higher_falsepos/$num_spam;
-my $ratio_falseneg = $num_under_lower_falseneg/$num_nonspam;
+my $ratio_falseneg = $num_under_lower_falseneg/$num_ham;
 
 my $skipped_non_nice = 0;
 
-foreach $rule (keys %freq_spam) {
-  my $expected = $freq_spam{$rule}*$ratio_falsepos;
+# non-nice rules
+foreach my $rule (keys %freq_over_higher_falsepos) {
+  my $expected = $rules->{$rule}->{freq_spam}*$ratio_falsepos;
   if ($expected <= $min_expected) {
     $skipped_non_nice++;
     next;
@@ -136,7 +254,7 @@
    $freq_over_higher_falsepos{$rule}/$expected;
   ($chisquare{$rule},$prob{$rule}) =
    chisquare($num_spam,$num_over_higher_falsepos,
-	     $freq_spam{$rule},$freq_over_higher_falsepos{$rule});
+	     $rules->{$rule}->{freq_spam},$freq_over_higher_falsepos{$rule});
   if ($freq_over_higher_falsepos{$rule} < $expected) {
     $chisquare{$rule} *= -1;
   }
@@ -146,8 +264,9 @@
 
 my $skipped_nice = 0;
 
-foreach $rule (keys %freq_nonspam) {
-  my $expected = $freq_nonspam{$rule}*$ratio_falseneg;
+# nice rules
+foreach my $rule (keys %freq_under_lower_falseneg) {
+  my $expected = $rules->{$rule}->{freq_ham}*$ratio_falseneg;
   if ($expected <= $min_expected) {
     $skipped_nice++;
     next;
@@ -158,8 +277,8 @@
   $ratio_expected_falseneg{$rule} =
    $freq_under_lower_falseneg{$rule}/$expected;
   ($chisquare{$rule},$prob{$rule}) =
-   chisquare($num_nonspam,$num_under_lower_falseneg,
-	     $freq_nonspam{$rule},$freq_under_lower_falseneg{$rule});
+   chisquare($num_ham,$num_under_lower_falseneg,
+	     $rules->{$rule}->{freq_ham},$freq_under_lower_falseneg{$rule});
   if ($freq_under_lower_falseneg{$rule} < $expected) {
     $chisquare{$rule} *= -1;
   }
@@ -167,8 +286,12 @@
 
 warn "Skipped nice: $skipped_nice\n";
 
-@rules_falsepos = grep {$prob{$_} < .5} (keys %over_expected_falsepos);
+# The rest is copied verbatim from before - it's complicated and not
+# commented and should work unchanged except for the freq_spam and
+# freq_ham stuff and fixing some use strict stuff
 
+my @rules_falsepos = grep {$prob{$_} < .5} (keys %over_expected_falsepos);
+
 if (scalar(@rules_falsepos)) {
   print "RULE\t\tCHISQUARE\tRATIO_FALSEPOS\tOVER_FALSEPOS\tFREQ_OVER ($num_over_higher_falsepos)\n";
   my(@rules_falsepos_bad) =
@@ -183,7 +306,7 @@
 	   $over_expected_falsepos{$a}) ||
 	    ($freq_over_higher_falsepos{$b} <=>
 	     $freq_over_higher_falsepos{$a})} (@rules_falsepos_bad);
-    foreach $rule (@rules_falsepos_bad) {
+    foreach my $rule (@rules_falsepos_bad) {
       print $rule . "\t" . $prob{$rule} . "\t" .
        $ratio_expected_falsepos{$rule} . "\t" .
 	$over_expected_falsepos{$rule} . "\t" .
@@ -199,9 +322,9 @@
        ($chisquare{$a} <=> $chisquare{$b}) ||
 	($ratio_expected_falsepos{$a} <=>
 	 $ratio_expected_falsepos{$b}) ||
-	  ($freq_spam{$b} <=>
-	   $freq_spam{$a})} (@rules_falsepos_good);
-    foreach $rule (@rules_falsepos_good) {
+	  ($rules->{$b}->{freq_spam} <=>
+	   $rules->{$a}->{freq_spam})} (@rules_falsepos_good);
+    foreach my $rule (@rules_falsepos_good) {
       print $rule . "\t" . $prob{$rule} . "\t" .
        $ratio_expected_falsepos{$rule} . "\t" .
 	$over_expected_falsepos{$rule} . "\t" .
@@ -212,7 +335,7 @@
   warn "No over-falsepos to print\n";
 }
 
-@rules_falseneg = grep {$prob{$_} < .5} (keys %over_expected_falseneg);
+my @rules_falseneg = grep {$prob{$_} < .5} (keys %over_expected_falseneg);
 
 if (scalar(@rules_falseneg)) {
   print "RULE\t\tCHISQUARE\tRATIO_FALSENEG\tOVER_FALSENEG\tFREQ_UNDER ($num_under_lower_falseneg)\n";
@@ -228,7 +351,7 @@
 	   $over_expected_falseneg{$a}) ||
 	    ($freq_under_lower_falseneg{$b} <=>
 	     $freq_under_lower_falseneg{$a})} (@rules_falseneg_bad);
-    foreach $rule (@rules_falseneg_bad) {
+    foreach my $rule (@rules_falseneg_bad) {
       print $rule . "\t" . $prob{$rule} . "\t" .
        $ratio_expected_falseneg{$rule} . "\t" .
 	$over_expected_falseneg{$rule} . "\t" .
@@ -244,9 +367,9 @@
        ($chisquare{$a} <=> $chisquare{$b}) ||
 	($ratio_expected_falseneg{$a} <=>
 	 $ratio_expected_falseneg{$b}) ||
-	  ($freq_spam{$b} <=>
-	   $freq_spam{$a})} (@rules_falseneg_good);
-    foreach $rule (@rules_falseneg_good) {
+	  ($rules->{$b}->{freq_ham} <=>
+	   $rules->{$a}->{freq_ham})} (@rules_falseneg_good);
+    foreach my $rule (@rules_falseneg_good) {
       print $rule . "\t" . $prob{$rule} . "\t" .
        $ratio_expected_falseneg{$rule} . "\t" .
 	$over_expected_falseneg{$rule} . "\t" .
@@ -258,97 +381,3 @@
 }
 
 exit;
-
-sub readlogs {
-  my $spam = $ARGV[0] || "spam.log";
-  my $nonspam = $ARGV[1] || (-f "good.log" ? "good.log" : "nonspam.log");
-
-
-  (open(NONSPAM,$nonspam)) ||
-   (die "Couldn't open file '$nonspam': $!; stopped");
-
-  while (defined($line = <NONSPAM>)) {
-    if ($line =~ m/^\s*\#/) {
-      next;
-    } elsif ($line =~ m/^.\s+-?\d+\s+\S+\s*(\S*)/) {
-      my $tests = $1;
-      my $hits = 0;
-      my(@tests) = ();
-      foreach $test (grep {length($_)} (split(/,+/,$tests))) {
-	if (exists($rules{$test})) {
-	  push @tests, $test;
-	  $hits += $rules{$test}->{score};
-	}
-      }
-      
-      if (scalar(@tests)) {
-	$num_nonspam++;
-	foreach $test (grep {exists($freq_nonspam{$_})} (@tests)) {
-	  $freq_nonspam{$test}++;
-	}
-	if ($hits >= $higher) {
-	  $num_over_higher_falsepos++;
-	  foreach $test (grep
-			 {exists($freq_over_higher_falsepos{$_})} (@tests)) {
-	    $freq_over_higher_falsepos{$test}++;
-	  }
-	}
-      }
-    } elsif ($line =~ m/\S/) {
-      chomp($line);
-      warn "Can't interpret line '$line'; skipping";
-    }
-  }
-
-  close(NONSPAM);
-
-  (open(SPAM,$spam)) || (die "Couldn't open file '$spam': $!; stopped");
-
-  while (defined($line = <SPAM>)) {
-    if ($line =~ m/^\s*\#/) {
-      next;
-    } elsif ($line =~ m/^.\s+-?\d+\s+\S+\s*(\S*)/) {
-      my $tests = $1;
-      my $hits = 0;
-      my $plus_hits = 0;
-      my(@tests) = ();
-      foreach $test (grep {length($_)} (split(/,+/,$tests))) {
-	if (exists($rules{$test})) {
-	  push @tests, $test;
-	  $hits += $rules{$test}->{score};
-	  if ($rules{$test}->{score} > 0) {
-	    $plus_hits += $rules{$test}->{score};
-	  }
-	}
-      }
-      
-      if (scalar(@tests)) {
-	$num_spam++;
-	foreach $test (grep {exists($freq_spam{$_})} (@tests)) {
-	  $freq_spam{$test}++;
-	}
-	if (($hits <= $lower) && $plus_hits &&
-	    ($plus_hits >= $lower)) {
-	  $num_under_lower_falseneg++;
-	  foreach $test (grep
-			 {exists($freq_under_lower_falseneg{$_})} (@tests)) {
-	    $freq_under_lower_falseneg{$test}++;
-	  }
-	}
-      }
-    } elsif ($line =~ m/\S/) {
-      chomp($line);
-      warn "Can't interpret line '$line'; skipping";
-    }
-  }
-
-  close(SPAM);
-}
-
-
-sub readscores {
-  system ("./parse-rules-for-masses") and
-   die "Couldn't do parse-rules-for-masses: $?; stopped";
-  require "./tmp/rules.pl";
-}
-
Index: masses/tenpass/10pass-compute-tcr
===================================================================
--- masses/tenpass/10pass-compute-tcr	(revision 20231)
+++ masses/tenpass/10pass-compute-tcr	(working copy)
@@ -6,12 +6,12 @@
 do
   mkdir tmp/10passrules > /dev/null 2>&1
   cp ../rules/[0-9]*.cf tmp/10passrules
-  ./rewrite-cf-with-new-scores $SCORESET ../rules/50_scores.cf \
-	tenpass_results/scores.$run > tmp/10passrules/50_scores.cf
+  ./rewrite-cf-with-new-scores -s $SCORESET --old=../rules/50_scores.cf \
+	--new=tenpass_results/scores.$run --out=tmp/10passrules/50_scores.cf \
+        --cffile=../rules
 
   ./fp-fn-statistics --cffile=tmp/10passrules \
-	--spam=tenpass_results/spam.log.$run \
-	--nonspam=tenpass_results/ham.log.$run > tmp/stats
+	--logfile=tenpass_results/masses.log.$run > tmp/stats
 
   grep TCR: tmp/stats
 done
Index: masses/tenpass/10pass-run
===================================================================
--- masses/tenpass/10pass-run	(revision 20231)
+++ masses/tenpass/10pass-run	(working copy)
@@ -1,13 +1,10 @@
 #!/bin/sh
 
 # change these!
-NSBASE=ham-logs
-SPBASE=spam-logs
-SCORESET="0"
+BASE=logs/
 
 passes="1 2 3 4 5 6 7 8 9 10"
-mkdir -p tenpass_results
-mkdir -p ORIG
+mkdir -p tenpass_results  # -p: do not complain when re-running with an existing dir
 
 > make.output
 
@@ -17,28 +14,27 @@
   echo "Training for corpus $id..."
   pwd; date
 
-  > ORIG/ham-set$SCORESET.log
-  > ORIG/spam-set$SCORESET.log
-
+  > masses.log
   echo -n "(using corpora blocks: "
   for notid in $passes ; do
     if [ "$notid" != "$id" ] ; then
       echo -n "$notid "
-      cat $NSBASE/split-$notid.log >> ORIG/ham-set$SCORESET.log
-      cat $SPBASE/split-$notid.log >> ORIG/spam-set$SCORESET.log
+      cat $BASE/split-$notid.log >> masses.log
     fi
   done
   echo "for training)"
 
   make clean >> make.output
-  make >> make.output 2>&1
-  ./runGA
-  pwd
-  date
+  make perceptron >> make.output 2>&1  # redirect stdout first so stderr follows it into the log
+  ./perceptron
+  pwd; date
 
   echo "Saving test data for corpus $id..."
 
-  cp $NSBASE/split-$id.log tenpass_results/ham.log.$id
-  cp $SPBASE/split-$id.log tenpass_results/spam.log.$id
-  cp gen-set$SCORESET.scores tenpass_results/scores.$id
+  cp $BASE/split-$id.log tenpass_results/masses.log.$id
+
+  cp perceptron.scores tenpass_results/scores.$id
+
 done
+
+