Index: lib/Mail/SpamAssassin/Masses.pm
===================================================================
--- lib/Mail/SpamAssassin/Masses.pm	(revision 0)
+++ lib/Mail/SpamAssassin/Masses.pm	(revision 0)
@@ -0,0 +1,833 @@
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+
+=head1 NAME
+
+Mail::SpamAssassin::Masses - Interface for reading and parsing rules
+and mass-check logs for SpamAssassin
+
+=head1 SYNOPSIS
+
+  my $parser = Mail::SpamAssassin::Masses->new();
+  my $rules = $parser->readrules();
+  my $logs = $parser->readlogs();
+
+  foreach my $test (keys %$rules) {
+    if ($rules->{$test}->{score} > 1) {
+      ...
+    }
+
+=head1 DESCRIPTION
+
+Mail::SpamAssassin::Masses is a module to simplify the many scripts
+that used to make up the SpamAssassin re-scoring process. By
+consolidating all the shared code in one module, the scripts can be
+simplified and require fewer temporary files.
+
+=head1 METHODS
+
+=over 4
+
+=cut
+
+package Mail::SpamAssassin::Masses;
+
+use strict;
+use warnings;
+use Carp;
+
+=item $parser = Mail::SpamAssassin::Masses->new( [ { opt => val, ... } ] );
+
+Construct a new Mail::SpamAssassin::Masses object. You may pass the
+following attribute-value pairs to the constructor.
+
+=over 4
+
+=item rulesdir
+
+The directory containing rules. If multiple directories are desired,
+an anonymous array should be passed.
+
+=item scoreset
+
+Scoreset to deal with.
+
+=item logfile
+
+Filename of mass-check log.
+
+=item falses
+
+Also count frequencies for false positives and false negatives from
+the logs.
+
+=item falsesonly
+
+Only count false positives and false negatives.
+
+=item greprule
+
+Coderef that is passed a rule name and a hash ref with the entries
+containing info about the rule. If the sub returns false, it is skipped.
+
+=item greplog
+
+Coderef that is passed a raw log entry. If it returns false, the entry
+is skipped.
+
+=item sliding_window
+
+Use a sliding window for score ranges rather than a shrinking window.
+
+=item nologs
+
+Save memory by not saving the individual log results, just the
+aggregate totals
+
+=back
+
+=cut
+
+sub new {
+
+  my $class = shift;
+  $class = ref($class) || $class;
+
+  my $self = shift;
+  if (!defined $self){
+    $self = { };
+  }
+
+  $self->{scoreset} ||= 0;
+  $self->{rulesdir} ||= '';
+  $self->{logfile} ||= "masses.log";
+
+  bless($self, $class);
+
+  return $self;
+
+}
+
+=item $parser->readrules()
+
+Read and parse the rules from the directory specified as
+C<rulesdir>. This loads the following keys and values into the hash
+entry representing the rules (see below).
+
+=over 4
+
+=item name
+
+Contains the rule's name.
+
+=item score
+
+Contains the rule's score.
+
+=item type
+
+Contains the rule's type (header, body, uri, etc.)
+
+=item tflags
+
+Contains the rules tflags (nice, autolearn, etc.) as specified in the config file.
+
+=item lang
+
+Set to the value of C<lang> for language-specific tests.
+
+=item issubrule
+
+Set to true if the rules is a sub-rule, (i.e. it starts with
+__). Otherwise, undefined.
+
+=item isnice
+
+This key exists and is true if the rule is nice (i.e. with a score
+that can be below zero).
+
+=item describe
+
+Set to the rule's description, in English, or in the rule's language.
+
+=back
+
+There may be more values once C<readlogs()> is run.
+
+=cut
+
+
+sub readrules {
+
+  my $self = shift;
+
+  $self->{rules} ||= { };
+  my $rules = $self->{rules}; # $rules is a reference to the anon hash
+
+  my @dirs = ref($self->{rulesdir}) ? @{$self->{rulesdir}} : $self->{rulesdir};
+
+  foreach my $indir (@dirs) {
+    my @files = glob("$indir/*.cf"); # no reason to only do numbered files
+
+    foreach my $file (@files) {
+      open (IN, "<$file") || croak("Can't open $file, $!");
+      while(<IN>) {
+        s/#.*$//g;
+        s/^\s+//;
+        s/\s+$//;
+        next if /^$/;
+
+        my $lang = '';
+        if (s/^lang\s+(\S+)\s+//) {
+          $lang = lc $1;
+        }
+
+        if (/^(header|rawbody|body|full|uri|meta)\s+(\S+)\s+/) {
+          my $type = $1;
+          my $name = $2;
+
+          $rules->{$name} ||= { };
+	  $rules->{$name}->{name} = $name;
+          $rules->{$name}->{type} = $type;
+          $rules->{$name}->{lang} = $lang if $lang;
+          $rules->{$name}->{tflags} = '';
+
+          if ($name =~ /^__/) {
+	    $rules->{$name}->{issubrule} = '1';
+	  }
+
+        } elsif (/^describe\s+(\S+)\s+(.+)$/) {
+
+          # Let's get description in english, por favor -- unless the rule isn't english
+
+	  next if ($lang && (!$rules->{$1}->{lang} || $rules->{$1}->{lang} ne $lang));
+
+          $rules->{$1} ||= { };
+          $rules->{$1}->{describe} = $2;
+
+        } elsif (/^tflags\s+(\S+)\s+(.+)$/) {
+	  my $name = $1;
+          $rules->{$name} ||= { };
+          $rules->{$name}->{tflags} = $2;
+	  if ($2 =~ /nice/) {
+	    $rules->{$name}->{isnice} = 1;
+	  }
+        } elsif (/^score\s+(\S+)\s+(.+)$/) {
+          my($name,$score) = ($1,$2);
+          $rules->{$name} ||= { };
+          if ( $score =~ /\s/ ) { # there are multiple scores
+            ($score) = (split(/\s+/,$score))[$self->{scoreset}];
+          }
+          $rules->{$name}->{score} = $score;
+        }
+      }
+      close IN;
+    }
+  }
+  foreach my $rule (keys %{$rules}) {
+    if (!defined $rules->{$rule}->{type}) {
+      delete $rules->{$rule};   # no rule definition -> no rule
+      next;
+    }
+
+    if (!defined $rules->{$rule}->{score}) {
+      my $def = 1.0;
+      if ($rule =~ /^T_/) { $def = 0.01; }
+
+      if ($rules->{$rule}->{isnice}) {
+        $rules->{$rule}->{score} = -$def;
+      } else {
+        $rules->{$rule}->{score} = $def;
+      }
+    }
+
+    if ($self->{greprules} && !&{$self->{greprules}}($rule, $rules->{$rule}))
+    {
+      delete $rules->{$rule};
+      next;
+    }
+
+  }
+
+  $self->{_readrules} = 1;
+}
+
+=item $parser->readlogs()
+
+Read and parse logs from C<logsdir>. This will create the anonymous
+array of hashes referred to by C<$parser->{logs}>, with the following
+keys:
+
+=over 4
+
+=item isspam
+
+True if the message is spam. False or undefined otherwise.
+
+=item isfalse
+
+True if the message was a false negative or positive.
+
+=item tests_hit
+
+Array reference containing references to the hash representing each
+rule hit.
+
+=item score
+
+Score the message received (under current scores).
+
+=back
+
+In addition, this method adds the following keys to the rule
+information in C<$parser->{rules}>.
+
+=over 4
+
+=item freq_spam
+
+Frequency hit in spam.
+
+=item freq_ham
+
+Frequency hit in ham.
+
+=item freq_fp
+
+Frequency in false positives.
+
+=item freq_fn
+
+Frequency in false negatives.
+
+=back
+
+Also, sets C<$parser->{num_spam}> and C<$parser->{num_ham}> to the number of
+spam logs read and the number of ham logs read, respectively.
+
+=cut
+
+sub readlogs {
+
+  my $self = shift;
+
+  if (!$self->{_readrules}) {
+    # need to read scores first!
+    $self->readrules();
+  }
+
+  my $rules = $self->{rules}; # copy the ref, shorthand
+
+  my $logs;
+  if (! $self->{nologs}) {
+    $self->{logs} ||= [ ];
+    $logs = $self->{logs};
+  }
+
+
+  my ($num_spam, $num_ham, $count, $num_fp, $num_fn);
+  $num_spam = $num_ham = $count = $num_fp = $num_fn = 0;
+
+  # First, initialize stuff
+  foreach my $rule (values %{$self->{rules}}) {
+    $rule->{freq_spam} ||= 0;
+    $rule->{freq_ham} ||= 0;
+
+    if($self->{falses}) {
+      $rule->{freq_fp} ||= 0;
+      $rule->{freq_fn} ||= 0;
+    }
+
+  }
+
+  my $file = $self->{logfile};
+  open (IN, "<$file");
+
+  while (<IN>) {
+    next if /^\#/;
+    next if /^$/;
+    if($_ !~ /^(.)\s+(.)\s+-?[\d.]+\s+\S+(\s+\S+\s+)/) { warn "bad line: $_"; next; }
+
+    if ($self->{greplogs} && !&{$self->{greplogs}}($_)) {
+      next;
+    }
+
+    my $manual = $1;
+    my $result = $2;
+    $_ = $3;
+    s/(?:bayes|time)=\S+//;
+    s/,,+/,/g;
+    s/^\s+//;
+    s/\s+$//;
+
+
+    if ($manual ne $result) {
+      $self->{isfalse} = 1;
+    }
+    elsif ($self->{falsesonly}) {
+      next;
+    }
+
+    if ($manual eq "s") {
+      $num_spam++;
+      $logs->[$count]->{isspam} = 1 unless $self->{nologs};
+      $num_fn++ if $result eq "h";
+    } else {
+      $num_ham++;
+      $num_fp++ if $result eq "s";
+    }
+
+    my @tests = ();
+    my $score = 0;
+    foreach my $tst (split (/,/, $_)) {
+      next if ($tst eq '');
+
+      # Don't count non-existant rules
+      # (Could happen with greprules)
+      next if ( !$rules->{$tst} || !$rules->{$tst}->{type} );
+
+      if ($manual eq "s") {
+	  $rules->{$tst}->{freq_spam}++;
+	  $rules->{$tst}->{freq_fn}++ if ($self->{falses} && $result eq "h");
+      }
+      else {
+	  $rules->{$tst}->{freq_ham}++;
+	  $rules->{$tst}->{freq_fp}++ if ($self->{falses} && $result eq "s");
+      }
+
+      $score += $rules->{$tst}->{score};
+
+      push (@tests, $rules->{$tst}) unless $self->{nologs};
+    }
+
+    $logs->[$count]->{tests_hit} = \@tests unless $self->{nologs};
+    $logs->[$count]->{score} = $score;
+
+    $count++;
+  }
+  close IN;
+
+  $self->{num_spam} = $num_spam;
+  $self->{num_ham} = $num_ham;
+  if ($self->{falses}) {
+    $self->{num_fn} = $num_fn;
+    $self->{num_fp} = $num_fp;
+  }
+
+  $self->{_readlogs} = 1; # Done reading logs
+
+}
+
+=item $parser->do_statistics();
+
+Calculate the S/O ratio and the rank for each test.
+
+This adds the following keys to the rules hashes.
+
+=over 4
+
+=item spam_percent
+
+Percentage of spam messages hit.
+
+=item ham_percent
+
+Percentage of ham messages hit.
+
+=item soratio
+
+S/O ratio -- percentage of spam messages hit divided by total
+percentage of messages hit.
+
+=back
+
+=cut
+
+sub do_statistics {
+  my $self = shift;
+
+  if (! $self->{_readlogs} ) {
+    $self->readlogs();
+  }
+
+  my $rank_hi=0;
+  my $rank_lo=999999;
+
+  foreach my $rule (values %{$self->{rules}}) {
+
+    if (!$rule->{freq_spam}) {
+      $rule->{spam_percent} = 0;
+    } else {
+      $rule->{spam_percent} = $rule->{freq_spam} / $self->{num_spam} * 100.0;
+    }
+
+    if (!$rule->{freq_ham}) {
+      $rule->{ham_percent} = 0;
+    } else {
+      $rule->{ham_percent} = $rule->{freq_ham} / $self->{num_ham} * 100.0;
+    }
+
+    if (!$rule->{freq_spam} && !$rule->{freq_ham}) {
+      $rule->{soratio} = 0.5;
+      next;
+    }
+
+    $rule->{soratio} = $rule->{spam_percent} / ($rule->{spam_percent} + $rule->{ham_percent});
+
+  }
+
+  $self->{_statistics} = 1;
+
+}
+
+=item $parser->do_rank();
+
+Calculates the ranking for each rule and stores this in the
+appropriate key.
+
+=over 4
+
+=item rank
+
+"Rank" of the rule. High numbers are good, low are bad.
+
+=back
+
+=cut
+
+sub do_rank {
+
+  my $self = shift;
+
+  if (! $self->{_statistics} ) {
+    $self->do_statistics();
+  }
+
+  my $rank_hi = 0;
+  my $rank_lo = 9999999;
+
+  my %unwanted;
+  my %wanted;
+  my %wranks;
+  my %uranks;
+  my $rules = $self->{rules};
+
+
+  foreach my $rule (values %{$self->{rules}}) {
+
+    $wanted{$rule->{name}} = $rule->{isnice} ? $rule->{freq_ham} : $rule->{freq_spam};
+    $unwanted{$rule->{name}} = $rule->{isnice} ? $rule->{freq_spam} : $rule->{freq_ham};
+
+    $wranks{$wanted{$rule->{name}}} = 1;
+    $uranks{$unwanted{$rule->{name}}} = 1;
+
+  }
+
+  my @wanted = sort { $wanted{$a} <=> $wanted{$b} } keys %wanted;
+  my @unwanted = sort { $unwanted{$b} <=> $unwanted{$a} } keys %unwanted;
+
+  # first half of ranking is the wanted rank
+  my $position = 0;
+  my $last = undef;
+
+  foreach my $test (@wanted) {
+    $position++ if defined $last && $last != $wanted{$test};
+    $rules->{$test}->{rank} += $position;
+    $last = $wanted{$test};
+  }
+
+  # second half is the unwanted rank
+  $position = 0;
+  $last = undef;
+  my $normalize = (scalar keys %wranks) / (scalar keys %uranks);
+
+  foreach my $test (@unwanted) {
+    $position++ if defined $last && $last != $unwanted{$test};
+    $rules->{$test}->{rank} += ($position * $normalize);
+    $last = $unwanted{$test};
+    $rank_hi = $rules->{$test}->{rank} if ($rules->{$test}->{rank} > $rank_hi);
+    $rank_lo = $rules->{$test}->{rank} if ($rules->{$test}->{rank} < $rank_lo);
+  }
+
+  $rank_hi = $rank_hi - $rank_lo;
+  foreach my $rule (values %{$rules}) {
+    $rule->{rank} = ($rank_hi == 0) ? 0.001 : (($rule->{rank} - $rank_lo)/ $rank_hi);
+  }
+
+  $self->{_rank} = 1;
+}
+
+=item $parser->get_rules_array();
+
+Returns a reference to an array of hash references. The values of
+these hash have keys as listed above.
+
+=cut
+
+sub get_rules_array {
+  my $self = shift;
+  return [ values %{$self->{rules}} ];
+}
+
+=item $parser->get_rules_hash();
+
+Returns a reference to a hash with rule names as keys and hash
+references as values. The values of these hash have keys as listed
+above.
+
+=cut
+
+sub get_rules_hash {
+  my $self = shift;
+  return $self->{rules};
+}
+
+=item $parser->get_logs();
+
+Returns a reference to the array containing log entries, in the form
+of anonymous hashes with keys as described above.
+
+=cut
+
+sub get_logs {
+  my $self = shift;
+  return $self->{logs};
+}
+
+=item $parser->get_num_ham();
+
+Returns number of ham logs read.
+
+=cut
+
+sub get_num_ham {
+  my $self = shift;
+  return $self->{num_ham};
+}
+
+=item $parser->get_num_spam();
+
+Returns number of spam logs read.
+
+=cut
+
+sub get_num_spam {
+  my $self = shift;
+  return $self->{num_spam};
+}
+
+=item $parser->do_score_ranges();
+
+Figure out range in which score can be set based on the soratio, etc.
+
+This is necessary so that the perceptron doesn't set silly
+scores. (This may not be as much of a problem as it was with the old
+GA.)
+
+This adds the following keys to the rules hashes:
+
+=over 4
+
+=item ismutable
+
+Determines whether the perceptron can select a score for this test.
+
+=item range_lo
+
+Determines the lowest score the perceptron can set.
+
+=item range_hi
+
+Determines the highest score the perceptron can set.
+
+=cut
+
+sub do_score_ranges() {
+
+  my $self = shift;
+
+  if ( !$self->{_statistics} ) {
+    $self->do_statistics();
+  }
+  if ( !$self->{_rank} ) {
+    $self->do_rank();
+  }
+
+  foreach my $rule (values %{$self->{rules}}) {
+
+    my ($rank, $lo, $hi);
+
+    $rank = $rule->{rank};
+
+    # Get rid of rules that don't hit -- and disable completely.
+    if ($rule->{spam_percent} + $rule->{ham_percent} < 0.01 ||
+	$rule->{score} == 0) {
+
+      $rule->{ismutable} = 0;
+      $rule->{range_lo} = $rule->{range_hi} = 0;
+      next;
+
+    }
+
+    # next: get rid of tests that don't apply in this scoreset
+    # or are userconf -- set ismutable to 0, but keep the score
+    if ($rule->{tflags} =~ /\buserconf\b/ ||
+	(($self->{scoreset} % 2) == 0 && $rule->{tflags} =~/\bnet\b/)) {
+
+      $rule->{ismutable} = 0;
+      $rule->{range_lo} = $rule->{range_hi} = $rule->{score};
+      next;
+
+    }
+
+
+    # Normal rules:
+
+    # This seems to convert from [-1,1] to [0,1] but we're already in
+    # [0,1] space - Is this right?
+
+    # The current way ranks are calculated, > 0.5 and < 0.5 have no
+    # special meaning
+
+#      # 0.0 = best nice, 1.0 = best nonnice
+#      if ($rule->{isnice}) {
+#        $rank = .5 - ($rank / 2);
+#      } else {
+#        $rank = .5 + ($rank / 2);
+#      }
+
+    # ranks are really meant to be used as human readable - not as
+    # tools to choose score range - use soratio instead!
+
+    if ($self->{use_sliding_window}) {
+      ($lo, $hi) = $self->sliding_window_ratio_to_range($rule->{soratio});
+    } else {
+      ($lo, $hi) = $self->shrinking_window_ratio_to_range($rule->{soratio});
+    }
+
+    # Modify good rules to be lower
+    if ($rule->{isnice}) {
+      if ($rule->{tflags} =~ /\blearn\b/) { # learn rules should get
+                                            # higher scores (-5.4)
+	$lo *= 1.8;
+      }
+      elsif ( $rule->{soratio} <= 0.05 && $rule->{ham_percent} > 0.5) {
+	$lo *= 1.5;
+      }
+
+      # argh, ugly... but i'm copying it whole...
+      $hi =	($rule->{soratio} == 0) ? $lo :
+    		($rule->{soratio} <= 0.005 ) ? $lo/1.1 :
+    		($rule->{soratio} <= 0.010 && $rule->{ham_percent} > 0.2) ? $lo/2.0 :
+		($rule->{soratio} <= 0.025 && $rule->{ham_percent} > 1.5) ? $lo/10.0 :
+		0;
+
+      if ($rule->{soratio} >= 0.35 ) {
+	($lo, $hi) = (0,0);
+      }
+    }
+    else { # Make non-nice rules have higher scores if they're good
+      if ($rule->{tflags} =~ /\blearn\b/ ) {
+	$hi *= 1.8;
+      }
+      elsif ( $rule->{soratio} >= 0.99 && $rule->{spam_percent} > 1.0) {
+	$hi *= 1.5;
+      }
+
+      $lo =	($rule->{soratio} == 1) ? $hi:
+    		($rule->{soratio} >= 0.995 ) ? $hi/4.0 :
+    		($rule->{soratio} >= 0.990 && $rule->{spam_percent} > 1.0) ? $hi/8.0 :
+		($rule->{soratio} >= 0.900 && $rule->{spam_percent} > 10.0) ? $hi/24.0 :
+		0;
+
+      if ($rule->{soratio} <= 0.65 ) { # auto-disable bad rules
+	($lo, $hi) = (0,0);
+      }
+    }
+
+
+    # Some sanity checking
+    if($hi < $lo) {
+      ($lo, $hi) = ($hi, $lo);
+    }
+
+
+    $rule->{ismutable} = ($lo == $hi) ? 0 : 1;
+    $rule->{range_lo} = $lo;
+    $rule->{range_hi} = $hi;
+
+  }
+}
+
+sub sliding_window_ratio_to_range {
+  my ($self, $ratio) = @_;
+
+  # (rough) graphic demo of this algorithm:
+  # 0.0  = -limit [......] 0 ........ limit
+  # 0.25 = -limit ..[..... 0 .]...... limit
+  # 0.5  = -limit ....[... 0 ...].... limit
+  # 0.75 = -limit ......[. 0 .....].. limit
+  # 1.0  = -limit ........ 0 [......] limit
+  my $sliding_window_limits = 4.8; # limits = [-$range, +$range]
+  my $sliding_window_size =   5.5; # scores have this range within limits
+
+  my $lo = -$sliding_window_limits + ($sliding_window_size * $ratio);
+  my $hi = +$sliding_window_limits - ($sliding_window_size * (1-$ratio));
+  if ($lo > $hi) {
+    ($lo,$hi) = ($hi,$lo);
+  }
+  ($lo, $hi);
+
+}
+sub shrinking_window_ratio_to_range {
+  my ($self, $ratio) = @_;
+
+
+  # 0.0  = -limit [......] 0 ........ limit
+  # 0.25 = -limit ....[... 0 ]....... limit
+  # 0.5  = -limit ......[. 0 .]...... limit (note: tighter)
+  # 0.75 = -limit .......[ 0 ...].... limit
+  # 1.0  = -limit ........ 0 [......] limit
+  my $shrinking_window_lower_base =   0.00; 
+  my $shrinking_window_lower_range =  1.00; # *ratio, added to above
+  my $shrinking_window_size_base =    1.00;
+  my $shrinking_window_size_range =   1.00; # *ratio, added to above
+
+  my $is_nice = 0;
+  my $adjusted = ($ratio -.5) * 2;      # adj [0,1] to [-1,1]
+  if ($adjusted < 0) { $is_nice = 1; $adjusted = -$adjusted; }
+
+#$adjusted /= 1.5 if ( $ratio < 0.95 && $ratio > 0.15 ); # tvd
+
+  my $lower = $shrinking_window_lower_base 
+                        + ($shrinking_window_lower_range * $adjusted);
+  my $range = $shrinking_window_size_base 
+                        + ($shrinking_window_size_range * $adjusted);
+  my $lo = $lower;
+  my $hi = $lower + $range;
+  if ($is_nice) {
+    my $tmp = $hi; $hi = -$lo; $lo = -$tmp;
+  }
+  if ($lo > $hi) {
+    ($lo,$hi) = ($hi,$lo);
+  }
+
+  ($lo, $hi);
+}
+
+
+# Pacify perl
+1;
Index: hit-frequencies
===================================================================
--- hit-frequencies	(revision 10703)
+++ hit-frequencies	(working copy)
@@ -16,385 +16,252 @@
 # limitations under the License.
 # </@LICENSE>
 
+
 use FindBin;
-use Getopt::Std;
-getopts("fm:M:X:l:L:pxhc:at:s:i");
+use lib "$FindBin::Bin/../lib";
+use Mail::SpamAssassin::Masses;
+use Getopt::Long qw(:config bundling auto_help);
+use Pod::Usage;
+use strict;
+use warnings;
 
+
 use vars qw {
   $opt_f $opt_m $opt_M $opt_X $opt_p $opt_x $opt_h $opt_l $opt_L $opt_c
-  $opt_a $opt_t $opt_s $opt_i $sorting
+  $opt_a $opt_t $opt_s $opt_z $opt_inclang $opt_auto
 };
 
-sub usage {
-  die "hit-frequencies [-c rules dir] [-f] [-m RE] [-M RE] [-X RE] [-l LC]
-                [-s SC] [-a] [-p] [-x] [-i] [spam log] [ham log]
+GetOptions("c|cffile=s@" => \$opt_c,
+	   "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
+	   "l|logfile=s" => \$opt_l,
+	   "f|falses" => \$opt_f,
+	   "a|all" => \$opt_a,
+	   "p|percentages" => \$opt_p,
+	   "x|extended" => \$opt_x,
+	   "m|matchrule=s" => \$opt_m, #,
+	   "t|tflags=s" => \$opt_t,
+	   "M|matchlog=s" => \$opt_M,
+	   "X|excludelog=s" => \$opt_X,
+	   "L|language=s" => \$opt_L,
+	   "include-language=s" => \$opt_inclang);
 
-    -c p   use p as the rules directory
-    -f     falses. count only false-negative or false-positive matches
-    -m RE  print rules matching regular expression
-    -t RE  print rules with tflags matching regular expression
-    -M RE  only consider log entries matching regular expression
-    -X RE  don't consider log entries matching regular expression
-    -l LC  also print language specific rules for lang code LC (or 'all')
-    -L LC  only print language specific rules for lang code LC (or 'all')
-    -a     display all tests
-    -p     percentages. implies -x
-    -x     extended output, with S/O ratio and scores
-    -s SC  which scoreset to use
-    -i     use IG (information gain) for ranking
 
-    options -l and -L are mutually exclusive.
+=head1 NAME
 
-    options -M and -X are *not* mutually exclusive.
+hit-frequencies - Display statistics about tests hit by a mass-check run
 
-    if either the spam or and ham logs are unspecified, the defaults
-    are \"spam.log\" and \"ham.log\" in the cwd.
+=head1 SYNOPSIS
 
-";
-}
+hit-frequencies [options]
 
-usage() if($opt_h || ($opt_l && $opt_L));
+ Options:
+    -c,--cffile=path	  Use path as the rules directory
+    -s,--scoreset=n	  Use scoreset n
+    -l,--logfile=file	  Read in file instead of masses.log
+    -f			  Count only false-positives/false-negatives
+    -a			  Report all tests (including subrules)
+    -p			  Report percentages instead of raw hits
+    -x			  "Extended" output, include RANK, S/O and SCORE
+    -m,--matchrule=re     Print rules matching the regular expression
+    -t,--tflags=re	  Print only rules with tflags matching the regular expression
+    -M,--matchlog=re      Consider only logs matching the regular expression
+    -X,--excludelog=re	  Exclude logs matching this regular expression
+    -L,--language=lc	  Only print language specific tests for specified lang code (try 'all')
+    --include-language=lc Also print language specific tests for specified lang code (try 'all')
 
-if ($opt_p) {
-  $opt_x = 1;
-}
+=head1 DESCRIPTION
 
-$opt_s = 0 if ( !defined $opt_s );
+B<hit-frequencies> will read the mass-check log F<masses.log> or the
+log given by the B<--logfile> option. The output will contain a
+summary of the number of ham and spam messages and detailed statistics
+for each rule. By default, B<hit-frequencies> will try to guess the
+proper values for B<--cffile> based on the header of the
+masses.log. The output will include the following columns:
 
-my $cffile = $opt_c || "$FindBin::Bin/../rules";
+=over 4
 
-my %freq_spam = ();
-my %freq_ham = ();
-my $num_spam = 0;
-my $num_ham = 0;
-my %ranking = ();
-my $ok_lang = '';
+=item OVERALL
 
-readscores($cffile);
+Number of times (or percentage with B<-p>) the rule hit on
+all messages (spam or ham).
 
-$ok_lang = lc ($opt_l || $opt_L || '');
-if ($ok_lang eq 'all') { $ok_lang = '.'; }
+=item SPAM
 
-foreach my $key (keys %rules) {
+Number of times (or percentage with B<-p>) the rule hit on
+spam messages.
 
-  if ( ($opt_L && !$rules{$key}->{lang}) ||
-       ($rules{$key}->{lang} &&
-         (!$ok_lang || $rules{$key}->{lang} !~ /^$ok_lang/i)
-     ) ) {
-    delete $rules{$key} ; next;
-  }
+=item HAM
 
-  $freq_spam{$key} = 0;
-  $freq_ham{$key} = 0;
-}
+Number of times (or percentage with B<-p>) the rule hit on
+ham messages.
 
-readlogs();
+=item S/O
 
-my $hdr_all = $num_spam + $num_ham;
-my $hdr_spam = $num_spam;
-my $hdr_ham = $num_ham;
+Shown only with B<-x> or B<-p>, this is the number of spam hits
+divided by total number of hits (C<S/O> refers to spam divided by
+overall).
 
-if ($opt_p) {
-  my $sorting = $opt_i ? "IG" : "RANK";
-  if ($opt_f) {
-    printf "%7s %7s %7s  %6s  %6s  %6s  %s\n",
-  	"OVERALL%", "FNEG%", "FPOS%", "S/O", $sorting, "SCORE", "NAME";
-  } else {
-    printf "%7s %7s  %7s  %6s  %6s  %6s  %s\n",
-  	"OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME";
-  }
-  printf "%7d  %7d  %7d  %7.3f %6.2f  %6.2f  (all messages)\n",
-  	$hdr_all, $hdr_spam, $hdr_ham,
-        soratio ($num_spam,$num_ham), 0, 0;
+=item RANK
 
-  $hdr_spam = ($num_spam / $hdr_all) * 100.0;
-  $hdr_ham = ($num_ham / $hdr_all) * 100.0;
-  $hdr_all = 100.0;             # this is obvious
-  printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  (all messages as %%)\n",
-  	$hdr_all, $hdr_spam, $hdr_ham,
-        soratio ($num_spam,$num_ham), 0, 0;
+Shown only with B<-x> or B<-p>, this is a measure that attempts to
+indicate how I<good> or I<useful> a test is. The higher it is, the
+better the test.
 
-} elsif ($opt_x) {
-  printf "%7s %7s  %7s  %6s  %6s %6s  %s\n",
-  	"OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME";
-  printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  (all messages)\n",
-  	$hdr_all, $hdr_spam, $hdr_ham,
-        soratio ($num_spam,$num_ham), 0, 0;
+=item SCORE
 
-} else {
-  printf "%10s  %10s  %10s  %s\n",
-  	"OVERALL", "SPAM", "HAM", "NAME";
-  printf "%10d  %10d  %10d  (all messages)\n",
-  	$hdr_all, $hdr_spam, $hdr_ham;
-}
+Shown only with B<-x> or B<-p>, this is the current score assigned to
+the rule.
 
-my %done = ();
-my @tests = ();
-my $rank_hi = 0;
-my $rank_lo = 9999999;
+=item NAME
 
-# variables for wanted/unwanted RANK
-my %wanted;
-my %unwanted;
-my %wranks;
-my %uranks;
+This is the rule's name.
 
-foreach my $test (keys %freq_spam, keys %freq_ham) {
-  next unless (exists $rules{$test});           # only valid tests
-  next if (!$opt_a && $rules{$test}->{issubrule});
+=back
 
-  next if $done{$test}; $done{$test} = 1;
-  push (@tests, $test);
+=head1 BUGS
 
-  my $isnice = 0;
-  if ($rules{$test}->{tflags} =~ /nice/) { $isnice = 1; }
+Please report bugs to http://bugzilla.spamassassin.org/
 
-  my $fs = $freq_spam{$test}; $fs ||= 0;
-  my $fn = $freq_ham{$test}; $fn ||= 0;
-  my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0;
-  my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0;
+=head1 SEE ALSO
 
-  my $soratio = $soratio{$test} = soratio ($fsadj, $fnadj);
+L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>
 
-  if ($isnice) {
-    $soratio = 1.0 - $soratio;
-    my $tmp = $fsadj; $fsadj = $fnadj; $fnadj = $tmp;
-  }
+=cut
 
-  if ($opt_i) {
-    # come up with a ranking
-    my $rank;
-
-    # New new system: from "Learning to Filter Unsolicited Commercial E-Mail",
-    # Ion Androutsopoulos et al: determine the information gain IG(X, C) of the
-    # Boolean attributes (ie. the rules). Measures "the average reduction in
-    # the entropy of C (classification) given the value of X (the rule)". Makes
-    # a good ranking measure with a proper statistical basis. ;)
-    #
-    # Still would like to get an entropy measure in, too.
-    #
-    #             sum                                    P(X = x ^ C = c)
-    # IG(X,C) = x in [0, 1]    P(X = x ^ C = c) . log2( ------------------- )
-    #           c in [Ch, Cs]                           P(X = x) . P(C = c)
-    #
-    my $safe_nspam = $num_spam || 0.0000001;
-    my $safe_nham = $num_ham || 0.0000001;
-
-    my $num_all = ($num_spam + $num_ham);
-    my $safe_all = $num_all || 0.0000001;
-    my $f_all = $fs+$fn;
-
-    my $px0 = (($num_all - $f_all) / $safe_all);         # P(X = 0)
-    my $px1 = ($f_all / $safe_all);                      # P(X = 1)
-    my $pccs = ($num_spam / $safe_all);                  # P(C = Cs)
-    my $pcch = ($num_ham / $safe_all);                   # P(C = Ch)
-    my $px1ccs = ($fs / $safe_nspam);                   # P(X = 1 ^ C = Cs)
-    my $px1cch = ($fn / $safe_nham);                    # P(X = 1 ^ C = Ch)
-    my $px0ccs = (($num_spam - $fs) / $safe_nspam);     # P(X = 0 ^ C = Cs)
-    my $px0cch = (($num_ham - $fn) / $safe_nham);       # P(X = 0 ^ C = Ch)
-    my $safe_px0_dot_pccs = ($px0 * $pccs) || 0.00000001;
-    my $safe_px0_dot_pcch = ($px0 * $pcch) || 0.00000001;
-    my $safe_px1_dot_pccs = ($px1 * $pccs) || 0.00000001;
-    my $safe_px1_dot_pcch = ($px1 * $pcch) || 0.00000001;
-
-    sub log2 { return log($_[0]) / 0.693147180559945; } # log(2) = 0.6931...
-
-    my $safe_px0ccs = ($px0ccs || 0.0000001);
-    my $safe_px0cch = ($px0cch || 0.0000001);
-    my $safe_px1ccs = ($px1ccs || 0.0000001);
-    my $safe_px1cch = ($px1cch || 0.0000001);
-    $rank = ( $px0ccs * log2($safe_px0ccs / $safe_px0_dot_pccs) ) +
-                    ( $px0cch * log2($safe_px0cch / $safe_px0_dot_pcch) ) +
-                    ( $px1ccs * log2($safe_px1ccs / $safe_px1_dot_pccs) ) +
-                    ( $px1cch * log2($safe_px1cch / $safe_px1_dot_pcch) );
-
-    $ranking{$test} = $rank;
-    $rank_hi = $rank if ($rank > $rank_hi);
-    $rank_lo = $rank if ($rank < $rank_lo);
-  }
-  else {
-    # basic wanted/unwanted ranking
-    $wanted{$test} = $isnice ? $fn : $fs;
-    $unwanted{$test} = $isnice ? $fs : $fn;
-    # count number of ranks of each type
-    $wranks{$wanted{$test}} = 1;
-    $uranks{$unwanted{$test}} = 1;
-  }
+if ($opt_L && $opt_inclang) {
+  pod2usage("-L/--language and --include-language are mutually exclusive");
 }
 
-# finish basic wanted/unwanted ranking
-if (! $opt_i) {
-  my @wanted = sort { $wanted{$a} <=> $wanted{$b} } keys %wanted;
-  my @unwanted = sort { $unwanted{$b} <=> $unwanted{$a} } keys %unwanted;
-
-  # first half of ranking is the wanted rank
-  my $position = 0;
-  my $last = undef;
-  for my $test (@wanted) {
-    $position++ if defined $last && $last != $wanted{$test};
-    $ranking{$test} += $position;
-    $last = $wanted{$test}
-  }
-
-  # second half of ranking is the unwanted rank
-  my $normalize = (scalar keys %wranks) / (scalar keys %uranks);
-  $position = 0;
-  $last = undef;
-  for my $test (@unwanted) {
-    $position++ if defined $last && $last != $unwanted{$test};
-    $ranking{$test} += ($position * $normalize);
-    $last = $unwanted{$test};
-    $rank_hi = $ranking{$test} if ($ranking{$test} > $rank_hi);
-    $rank_lo = $ranking{$test} if ($ranking{$test} < $rank_lo);
-  }
+if ($opt_p) {
+  $opt_x = 1;
 }
 
-{
-  # now normalise the rankings to [0, 1]
-  $rank_hi -= $rank_lo;
-  foreach $test (@tests) {
-    $ranking{$test} = $rank_hi == 0 ? 0.001 : ($ranking{$test} - $rank_lo) / ($rank_hi);
-  }
-}
+$opt_s = 0 if ( !defined $opt_s );
 
-foreach $test (sort { $ranking{$b} <=> $ranking{$a} } @tests) {
-  next unless (exists $rules{$test});           # only valid tests
-  next if (!$opt_a && $rules{$test}->{issubrule});
+my $ok_lang = lc ( $opt_inclang || $opt_L || '');
+$ok_lang = '.' if ($ok_lang eq 'all');
 
-  my $fs = $freq_spam{$test}; $fs ||= 0;
-  my $fn = $freq_ham{$test}; $fn ||= 0;
-  my $fa = $fs+$fn;
+my $greprules = sub { # To determine whether rule should be read
+  my ($name, $rule) = @_;
 
-  next if ($opt_m && $test !~ m/$opt_m/);	# match certain tests
-  next if ($opt_t && $rules{$test}->{tflags} !~ /$opt_t/); # match tflags
+  return 0 if ($opt_m && $name !~ /$opt_m/); # name doesn't match -m
+                                             # expression
+  return 0 if ($opt_t && $rule->{tflags} !~ /$opt_t/); # tflags don't
+                                                       # match -t
+                                                       # expression
+  return 0 if (($opt_L && !$rule->{lang}) ||
+	   ($rule->{lang} &&
+	    (!$ok_lang || $rule->{lang} !~ /^$ok_lang/i))); # Wrong language
 
+  return 0 if ($rule->{issubrule} && !$opt_a);
+
   if (!$opt_a && !$opt_t) {
-    next if ($rules{$test}->{tflags} =~ /net/ && ($opt_s % 2 == 0));   # not net tests
-    next if ($rules{$test}->{tflags} =~ /userconf/); # or userconf
+    return 0 if ($rule->{tflags} =~ /net/ && ($opt_s % 2 == 0));
+    return 0 if ($rule->{tflags} =~ /userconf/); # or userconf
   }
+  return 1;
 
-  # adjust based on corpora sizes (and cvt to % while we're at it)
-  my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0;
-  my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0;
+};
 
-  if ($opt_f && $fsadj == 0 && $fnadj == 0) { next; }
 
-  if ($opt_p) {
-    $fa = ($fa / ($num_spam + $num_ham)) * 100.0;
-    $fs = $fsadj;
-    $fn = $fnadj;
-  }
+my $logfile = $opt_l || "masses.log";
 
-  my $soratio = $soratio{$test};
-  if (!defined $soratio) {
-    $soratio{$test} = soratio ($fsadj, $fnadj);
-  }
+if (!$opt_c || !scalar(@$opt_c)) {
+    # Try to read this in from the log, if possible
+    open IN, $logfile or die "Can't open $logfile: $!";
+    my $files = 0; # are we in the files section?
+    while(<IN>) {
+	if (!$files) {
+	    if (/^\# SVN revision:/) {
+		$opt_c = [ "$FindBin::Bin/../rules" ];
+		last;
+	    } elsif (/^\# Using configuration:$/) {
+		$files = 1;
+	    }
+	} elsif (/^\#\s+(.*)\s*$/) {
+	    push (@$opt_c, $1);
+	} else {
+	    # All done!
+	    last;
+	}
+    }
 
-  if ($opt_p) {
-    printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  %s\n",
-  	$fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}, $test;
-
-  } elsif ($opt_x) {
-    printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  %s\n",
-  	$fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}, $test;
-
-  } else {
-    printf "%10d  %10d  %10d  %s\n", $fa, $fs, $fn, $test;
-  }
+    foreach my $file (@$opt_c) {
+	die "Can't read $file" unless -r $file;
+    }
 }
-exit;
+	    
+my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
+					       scoreset => $opt_s,
+                                               falsesonly => $opt_f,
+                                               greprules => $greprules,
+                                               logfile => $logfile,
+                                               nologs => 1});
 
+$masses->readrules();
+$masses->readlogs();
+$masses->do_statistics();
+$masses->do_rank();
 
+my $rules = $masses->get_rules_hash();
+my $num_ham = $masses->get_num_ham();
+my $num_spam = $masses->get_num_spam();
+my $num_all = $num_ham + $num_spam;
 
-sub readlogs {
-  my $spam = $ARGV[0] || "spam.log";
-  my $ham = $ARGV[1] || (-f "good.log" ? "good.log" : "ham.log");
+if ($num_ham + $num_spam <= 0) {
+  die "Can't run hit-frequencies on 0 messages.";
+}
 
-  foreach my $file ($spam, $ham) {
-    open (IN, "<$file") || die "Could not open file '$file': $!";
+## Write header
 
-    my $isspam = 0; ($file eq $spam) and $isspam = 1;
+if ($opt_p) {
 
-    while (<IN>) {
-      next if (/^#/);
-      next unless (!$opt_M || /$opt_M/o);
-      next if ($opt_X && /$opt_X/o);
+  if ($opt_f) {
+    printf "%7s %7s %7s  %6s  %6s  %6s  %s\n",
+  	"OVERALL%", "FNEG%", "FPOS%", "S/O", "RANK", "SCORE", "NAME";
+  } else {
+    printf "%7s %7s  %7s  %6s  %6s  %6s  %s\n",
+  	"OVERALL%", "SPAM%", "HAM%", "S/O", "RANK", "SCORE", "NAME";
+  }
 
-      /^(.)\s+(-?\d+)\s+(\S+)\s*(\S*)/ or next;
-      my $caught = ($1 eq 'Y');
-      my $hits = $2;
-      $_ = $4; s/,,+/,/g;
+  printf "%7d  %7d  %7d  %7.3f %6.2f  %6.2f  (all messages)\n",
+  	$num_all, $num_spam, $num_ham,
+        $num_spam / $num_all, 0, 0;
 
-      if ($isspam) {
-        if ($opt_f) {
-          if (!$caught) { $num_spam++; }
-        } else {
-          $num_spam++;
-        }
-      } else {
-        if ($opt_f) {
-          if ($caught) { $num_ham++; }
-        } else {
-          $num_ham++;
-        }
-      }
+  printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  (all messages as %%)\n",
+  	100.0, $num_spam / $num_all * 100.0, $num_ham / $num_all * 100.0,
+        $num_spam / $num_all, 0, 0;
 
-      my @tests = split (/,/, $_);
-      foreach my $t (@tests) {
-	next if ($t eq '');
-	if ($isspam) {
-          if ($opt_f) {
-            if (!$caught) { $freq_spam{$t}++; }
-          } else {
-            $freq_spam{$t}++;
-          }
-	} else {
-          if ($opt_f) {
-            if ($caught) { $freq_ham{$t}++; }
-          } else {
-            $freq_ham{$t}++;
-          }
-	}
-      }
-    }
-    close IN;
-  }
-}
+} elsif ($opt_x) {
+  printf "%7s %7s  %7s  %6s  %6s %6s  %s\n",
+  	"OVERALL", "SPAM", "HAM", "S/O", "RANK", "SCORE", "NAME";
+  printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  (all messages)\n",
+  	$num_all, $num_spam, $num_ham,
+        $num_spam / $num_all, 0, 0;
 
-
-sub readscores {
-  my($cffile) = @_;
-  system ("$FindBin::Bin/parse-rules-for-masses -d \"$cffile\" -s $opt_s") and die;
-  require "./tmp/rules.pl";
+} else {
+  printf "%10s  %10s  %10s  %s\n",
+  	"OVERALL", "SPAM", "HAM", "NAME";
+  printf "%10d  %10d  %10d  (all messages)\n",
+  	$num_all, $num_spam, $num_ham;
 }
 
-sub soratio {
-  my ($s, $n) = @_;
+foreach my $test (sort { $rules->{$b}->{rank} <=> $rules->{$a}->{rank} } keys %{$rules}) {
 
-  $s ||= 0;
-  $n ||= 0;
-
-  if ($s + $n > 0) {
-      return $s / ($s + $n);
+  if ($opt_p) {
+    printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  %s\n",
+  	($rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham}) / $num_all * 100.0,
+        $rules->{$test}->{spam_percent}, $rules->{$test}->{ham_percent},
+        $rules->{$test}->{soratio}, $rules->{$test}->{rank}, $rules->{$test}->{score}, $test;
+  } elsif ($opt_x) {
+    printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  %s\n",
+  	$rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham},
+        $rules->{$test}->{freq_spam}, $rules->{$test}->{freq_ham},
+        $rules->{$test}->{soratio}, $rules->{$test}->{rank}, $rules->{$test}->{score}, $test;
   } else {
-      return 0.5;		# no results -> not effective
+    printf "%10d  %10d  %10d  %s\n",
+        $rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham},
+        $rules->{$test}->{freq_spam}, $rules->{$test}->{freq_ham}, $test;
   }
 }
 
-sub tcr {
-  my ($nspam, $nlegit, $nspamspam, $nlegitspam) = @_;
-  my $nspamlegit = $nspam - $nspamspam;
-  my $nlegitlegit = $nlegit - $nlegitspam;
-
-  my $lambda = 99;
-
-  my $werr = ($lambda * $nlegitspam + $nspamlegit)
-                  / ($lambda * $nlegit + $nspam);
-
-  my $werr_base = $nspam
-                  / ($lambda * $nlegit + $nspam);
-
-  $werr ||= 0.000001;     # avoid / by 0
-  my $tcr = $werr_base / $werr;
-  return $tcr;
-}
Index: rewrite-cf-with-new-scores
===================================================================
--- rewrite-cf-with-new-scores	(revision 10703)
+++ rewrite-cf-with-new-scores	(working copy)
@@ -16,32 +16,119 @@
 # limitations under the License.
 # </@LICENSE>
 
+=head1 NAME
+
+rewrite-cf-with-new-scores - Rewrite SpamAssassin scores file with new
+scores.
+
+=head1 SYNOPSIS
+
+rewrite-cf-with-new-scores [options]
+
+  Options
+  --old-scores=file    Read file containing the old SpamAssassin scores
+  --new-scores=file    Read file containing the new SpamAssassin scores
+  -s,--scoreset n      Rewrite scoreset n
+  --output=file        Output rewritten score file to file
+  -c,--cffile=path     Use path as the rules directory
+  -l,--logfile=file    Use file instead of masses.log (for guessing -c)
+
+ Note: these options can be shortened (i.e. --old, --new, --out) as
+ long as they are unambiguous.
+
+=head1 DESCRIPTION
+
+B<rewrite-cf-with-new-scores> is a tool to update the sitewide scores
+file with the newly generated scores. Since SpamAssassin has four
+different scoresets, which each need to be generated separately, this
+tool is used to only change the correct scoreset.
+
+By default, the old scores are read from 50_scores.cf in the rules
+directory and the new ones from ./perceptron.scores. The output will
+be ./50_scores.cf by default.
+
+The rules directory needs to be used to make sure scores are given for
+the right tests. Rules not found in the rules directory will not be
+given scores in the output.
+
+=head1 BUGS
+
+Please report bugs to http://bugzilla.spamassassin.org/
+
+=head1 SEE ALSO
+
+L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>
+
+=cut
+
+use FindBin;
+use lib "$FindBin::Bin/../lib";
+use Getopt::Long qw(:config bundling auto_help);
+use Mail::SpamAssassin::Masses;
+use Pod::Usage;
+use strict;
+use warnings;
+
+use vars qw($opt_old $opt_new $opt_scoreset $opt_out $opt_c $opt_l);
+
+GetOptions("old-scores=s" => \$opt_old,
+	   "new-scores=s" => \$opt_new,
+	   "s|scoreset=i" => \$opt_scoreset,
+	   "output=s" => \$opt_out,
+	   "c|cffile=s@" => \$opt_c,
+	   "l|logfile=s" => \$opt_l);
+
+$opt_l ||= "masses.log";
+$opt_scoreset = 0 unless defined $opt_scoreset;
+
 my $NUM_SCORESETS = 4;
 
-my ($scoreset,$oldscores,$newscores) = @ARGV;
+if (!$opt_c || !scalar(@$opt_c)) {
+    # Try to read this in from the log, if possible
+    open IN, $opt_l or die "Can't open $opt_l: $!";
+    my $files = 0; # are we in the files section?
+    while(<IN>) {
+	if (!$files) {
+	    if (/^\# SVN revision:/) {
+		$opt_c = [ "$FindBin::Bin/../rules" ];
+		last;
+	    } elsif (/^\# Using configuration:$/) {
+		$files = 1;
+	    }
+	} elsif (/^\#\s+(.*)\s*$/) {
+	    push (@$opt_c, $1);
+	} else {
+	    # All done!
+	    last;
+	}
+    }
 
-$scoreset = int($scoreset) if defined $scoreset;
-if (!defined $newscores || $scoreset < 0 || $scoreset >= $NUM_SCORESETS ) {
-  die "usage: rewrite-cf-with-new-scores scoreset oldscores.cf newscores.cf\n";
+    foreach my $file (@$opt_c) {
+	die "Can't read $file" unless -r $file;
+    }
 }
 
-system ("./parse-rules-for-masses -s $scoreset") and die;
-if (-e "tmp/rules.pl") {
-  # Note, the spaces need to stay in front of the require to work around a RPM 4.1 problem
-  require "./tmp/rules.pl";
+if (!$opt_old) {
+  $opt_old = $$opt_c[0] . "/50_scores.cf";
 }
-else {
-  die "parse-rules-for-masses had no error but no tmp/rules.pl!?!";
-}
 
+$opt_new ||= "50_scores.cf";
+
+my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
+					       scoreset => $opt_scoreset});
+
+$masses->readrules();
+my $rules = $masses->get_rules_hash();
+
 # now read the generated scores
 my @gascoreorder = ();
+my %oldscores = ();
 my %gascorelines = ();
-open (STDIN, "<$newscores") or die "cannot open $newscores";
+open (STDIN, "<$opt_new") or die "cannot open $opt_new";
 while (<STDIN>) {
   /^score\s+(\S+)\s+(-?\d+(?:\.\d+)?)/ or next;
   my $name = $1;  my $score = $2;
-  next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0);
+  next unless (exists ($rules->{$name}) && !$rules->{$name}->{issubrule});
   next if ($name =~ /^__/);
   next if ($name eq '(null)');	# er, oops ;)
 
@@ -49,7 +136,7 @@
   push (@gascoreorder, $name);
 }
 
-open (IN, "<$oldscores") or die "cannot open $oldscores";
+open (IN, "<$opt_old") or die "cannot open $opt_old";
 my $out = '';
 my $pre = '';
 
@@ -58,7 +145,7 @@
 while (<IN>) {
   if (/^\s*score\s+(\S+)\s/) {
     delete $gascorelines{$1};
-    next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0);
+    next unless (exists ($rules->{$1}) && $rules->{$1}->{issubrule} == 0);
   }
   $pre .= $_;
   /^# Start of generated scores/ and last;
@@ -82,10 +169,10 @@
   if (/^\s*score\s+\S+/) {
     my($score,$name,@scores) = split;
 
-    next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0);
+    next unless (exists ($rules->{$name}) && !$rules->{$name}->{issubrule});
     if (defined $gascorelines{$name}) {
       # Set appropriate scoreset value
-      $scores[$scoreset] = $gascorelines{$name};
+      $scores[$opt_scoreset] = $gascorelines{$name};
 
       # Create new score line
       $_ = join(" ","score",$name,generate_scores(@scores))."\n";
@@ -96,8 +183,10 @@
 }
 close IN;
 
+open OUT, ">$opt_out" or die "Can't open $opt_out: $!";
+
 # and output the lot
-print $pre, "\n";
+print OUT $pre, "\n";
 foreach my $name (@gascoreorder) {
   $_ = $gascorelines{$name};
   next unless (defined ($_));
@@ -107,12 +196,12 @@
   @scores = @{$oldscores{$name}} if ( exists $oldscores{$name} );
 
   # Set appropriate scoreset value
-  $scores[$scoreset] = $_;
+  $scores[$opt_scoreset] = $_;
 
   # Create new score line
-  print join(" ","score",$name,generate_scores(@scores)),"\n";
+  print OUT join(" ","score",$name,generate_scores(@scores)),"\n";
 }
-print "\n", $out, "\n";
+print OUT "\n", $out, "\n";
 
 sub generate_scores {
   my (@scores) = @_;
Index: Makefile
===================================================================
--- Makefile	(revision 10703)
+++ Makefile	(working copy)
@@ -4,33 +4,26 @@
 
 # What rule scoreset are we using?
 SCORESET =	0
+LOGFILE =	masses.log
 
 #### Should be no need to modify below this line
 
 all: badrules perceptron
 
 perceptron: perceptron.o
-	$(CC) -o perceptron perceptron.o $(LDFLAGS)
+	$(CC) -o perceptron perceptron.o $(LDFLAGS) 
 
-perceptron.o: tmp/rules.pl tmp/tests.h tmp/scores.h
+perceptron.o: tmp/tests.h
 	$(CC) $(CFLAGS) -c -o perceptron.o perceptron.c
 
-tmp/rules.pl: tmp/.created parse-rules-for-masses
-	perl parse-rules-for-masses -d ../rules -s $(SCORESET)
+tmp/tests.h: tmp/.created logs-to-c
+	perl logs-to-c --scoreset=$(SCORESET) --logfile=$(LOGFILE)
 
-tmp/tests.h: tmp/.created tmp/ranges.data logs-to-c
-	perl logs-to-c --scoreset=$(SCORESET)
+freqs: masses.log
+	perl hit-frequencies -x -p -s $(SCORESET) --logfile=$(LOGFILE) > freqs
 
-tmp/scores.h: tmp/tests.h
-
-tmp/ranges.data: tmp/.created freqs score-ranges-from-freqs
-	perl score-ranges-from-freqs ../rules $(SCORESET) < freqs
-
-freqs: spam.log ham.log
-	perl hit-frequencies -x -p -s $(SCORESET) > freqs
-
 badrules: freqs
-	perl lint-rules-from-freqs < freqs > badrules
+	perl lint-rules-from-freqs -s $(SCORESET) --logfile=$(LOGFILE) > badrules
 
 tmp/.created:
 	-mkdir tmp
Index: lint-rules-from-freqs
===================================================================
--- lint-rules-from-freqs	(revision 10703)
+++ lint-rules-from-freqs	(working copy)
@@ -16,124 +16,226 @@
 # limitations under the License.
 # </@LICENSE>
 
+=head1 NAME
+
+lint-rules-from-freqs - Try to find problems with SpamAssassin rules
+
+=head1 SYNOPSIS
+
+lint-rules-from-freqs [options]
+
+ Options:
+    -c,--cffile=path	  Use path as the rules directory
+    -s,--scoreset=n	  Use scoreset n
+    -l,--logfile=file	  Read in file instead of masses.log
+    -f			  Also take into account false positives/negatives
+
+=head1 DESCRIPTION
+
+This script analyzes SpamAssassin tests, based on the hit frequencies
+and S/O ratios from a mass-check log (masses.log).  This script can
+also optionally take into account the false positive/negative
+frequencies.
+
+The script first uses the SpamAssassin rules parser to report on any
+illegal syntax. Then it checks the rules match frequencies from the
+mass-check log in order to determine how effective the rule is.
+
+=head1 BUGS
+
+Please report bugs to http://bugzilla.spamassassin.org/
+
+=head1 SEE ALSO
+
+L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>
+
+=cut
+
+
+use FindBin;
+use lib "$FindBin::Bin/../lib";
+use Mail::SpamAssassin::Masses;
+use Mail::SpamAssassin;
+use Getopt::Long qw(:config bundling auto_help);
+use strict;
+use warnings;
+
 # any tests that get less than this % of matches on *both* spam or nonspam, are
 # reported.
 my $LOW_MATCHES_PERCENT = 0.03;
-my $scoreset = 0;
 
+use vars qw($opt_c $opt_l $opt_s $opt_f $opt_p);
+
+GetOptions("c|cffile=s@" => \$opt_c,
+	   "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
+	   "l|logfile=s" => \$opt_l,
+	   "f|falses" => \$opt_f);
+
+
+$opt_s = 0 unless defined $opt_s;
+$opt_l ||= "masses.log";
+
 sub usage {
   die "
 lint-rules-from-freqs: perform 'lint' testing on SpamAssassin rules and scores
 
-usage: ./lint-rules-from-freqs [-f falsefreqs] < freqs > badtests
+usage: ./lint-rules-from-freqs [-c rules dir] [-l logfile] [-s scoreset] [-f]
 
-This analyzes SpamAssassin tests, based on the hit frequencies and S/O ratios
-from a mass-check logfile pair.
+ -c p  Use p as rules directory
+ -l f  Use f as log fine
+ -s n  Use n as score set
+ -f    Check also for false positives/negatives
 
-The 'freqs' argument is the frequency of hits in all messages ('hit-frequencies
--x -p' output).
 
-The 'falsefreqs' argument is frequencies of hits in false-positives and
-false-negatives only ('hit-frequencies -x -p -f' output).
-
 ";
 }
 
-my $opt_falsefreqs;
-while ($#ARGV >= 0) {
-  $_ = shift @ARGV;
-  if (/^-f/) { $_ = shift @ARGV; $opt_falsefreqs = $_; }
-  elsif (/^-s/) { $_ = shift @ARGV; $scoreset = $_; }
-  else { usage(); }
+
+if (!$opt_c || !scalar(@$opt_c)) {
+    # Try to read this in from the log, if possible
+    open IN, $opt_l or die "Can't open $opt_l: $!";
+    my $files = 0; # are we in the files section?
+    while(<IN>) {
+	if (!$files) {
+	    if (/^\# SVN revision:/) {
+		$opt_c = [ "$FindBin::Bin/../rules" ];
+		last;
+	    } elsif (/^\# Using configuration:$/) {
+		$files = 1;
+	    }
+	} elsif (/^\#\s+(.*)\s*$/) {
+	    push (@$opt_c, $1);
+	} else {
+	    # All done!
+	    last;
+	}
+    }
+
+    foreach my $file (@$opt_c) {
+	die "Can't read $file" unless -r $file;
+    }
 }
 
 print "BAD TESTS REPORT\n";
-readrules();
-print "\n" .((scalar keys %rulefile) + 1). " rules found.\n";
+# First, do a --lint
+
 print "\nRule file syntax issues:\n\n";
-lintrules();
 
-if ($opt_falsefreqs) {
-  open (FALSE, "<$opt_falsefreqs");
-  while (<FALSE>) {
-    if (!/^\s*([\d\.]+)/) {
-      my ($overall, $spam, $nons, $so, $score, $name) = split (' ');
-      next unless ($name =~ /\S/);
-      $falsefreqs_spam{$name} = $spam;
-      $falsefreqs_nons{$name} = $nons;
-      $falsefreqs_so{$name} = $so;
+{
+  local (*STDERR) = \*STDOUT; # Get lint errors on STDOUT
+
+  # Read the config ourselves...
+
+  # Read init.pre from each directory, then glob for the rest.
+
+  my $cf_txt = '';
+  my @files;
+  my @dirs;
+  foreach my $file (@$opt_c) {
+    if (-d $file) {
+      if  (-r "$file/init.pre") {
+	push @files, "$file/init.pre";
+      }
+      push @dirs, $file;
     }
+    else {
+      push @files, $file;
+    }
   }
-  close FALSE;
-}
+  foreach my $dir (@dirs) {
+    my @cfs = glob("$dir/*.cf");
+    push @files, grep { -r $_ } @cfs;
+  }
 
-while (<>) {
-  if (!/^\s*([\d\.]+)/) {
-    $output{'a_header'} = $_; next;
+  foreach my $file (@files) {
+    if (-r $file) {
+      open IN, $file;
+      $cf_txt .= "file start $file\n";
+      $cf_txt .= join('', <IN>);
+      $cf_txt .= "\nfile end $file\n";
+      close IN;
+    }
   }
 
+  my $spamtest = new Mail::SpamAssassin({config_text => $cf_txt});
+
+  $spamtest->lint_rules();
+}
+
+
+# Next, check for other stuff
+my $masses = Mail::SpamAssassin::Masses->new({rulesdir => $opt_c,
+					      scoreset => $opt_s, #,,
+					      falses => $opt_f,
+					      logfile => $opt_l});
+
+$masses->readlogs();
+$masses->do_statistics();
+
+my $rules = $masses->get_rules_array();
+
+
+my %output;
+
+foreach my $rule (@$rules) {
+
   my $badrule;
-  my ($overall, $spam, $nons, $so, $score, $name) = split (' ');
-  next unless ($name =~ /\S/);
 
-  my $ffspam = $falsefreqs_spam{$name};
-  my $ffnons = $falsefreqs_nons{$name};
-  my $ffso = $falsefreqs_so{$name};
+  next if ($rule->{tflags} =~ /\bnet\b/ && ($opt_s % 2) == 0);
+  next if ($rule->{tflags} =~ /\buserconf\b/);
 
-  my $tf = $tflags{$name};
-  next if ($tf =~ /net/ && ($scoreset % 2) == 0);
-  next if ($tf =~ /userconf/);
+  if ($rule->{freq_spam} == 0 && $rule->{freq_ham} == 0) {        # sanity!
 
-  if ($overall == 0.0 && $spam == 0.0 && $nons == 0.0) {        # sanity!
     $badrule = 'no matches';
 
   } else {
-    if ($score < 0.0) {
+    if ($rule->{score} < 0.0) {
       # negative score with more spams than nonspams? bad rule.
-      if ($tf !~ /nice/ && $so > 0.5 && $score < 0.5) {
+      if (!$rule->{isnice} && $rule->{soratio} > 0.5 && $rule->{score} < 0.5) {
         $badrule = 'non-nice but -ve score';
       }
-
-      if ($tf =~ /nice/ && $so > 0.5 && $score < 0.5) {
-        if ($ffso < 0.5) {
+      if ($rule->{isnice} && $rule->{soratio} > 0.5 && $rule->{score} < 0.5) {
+        if ($opt_f && $rule->{freq_fn} < $rule->{freq_fp}) {
           $badrule = 'fn';
-        } else {
-          # ignore, the FNs are overridden by other tests so it doesn't
-          # affect the overall results.
         }
+        # else {
+        # ignore, the FNs are overridden by other tests so it doesn't
+        # affect the overall results.
+        # }
       }
 
       # low number of matches overall
-      if ($nons < $LOW_MATCHES_PERCENT) 
+      if ($rule->{ham_percent} < $LOW_MATCHES_PERCENT)
                  { $badrule ||= ''; $badrule .= ', low matches'; }
 
-    } elsif ($score > 0.0) {
+    } elsif ($rule->{score} > 0.0) {
       # positive score with more nonspams than spams? bad.
-      if ($tf =~ /nice/ && $so < 0.5 && $score > 0.5) {
+      if ($rule->{isnice} && $rule->{soratio} < 0.5 && $rule->{score} > 0.5) {
         $badrule = 'nice but +ve score';
       }
-
-      if ($tf !~ /nice/ && $so < 0.5 && $score > 0.5) {
-        if ($ffso > 0.5) {
+ 
+      if (!$rule->{isnice} && $rule->{soratio} < 0.5 && $rule->{score} > 0.5) {
+        if ($opt_f && $rule->{freq_fp} > $rule->{freq_fn}) {
           $badrule = 'fp';
-        } else {
-          # ignore, the FPs are overridden by other tests so it doesn't
-          # affect the overall results.
         }
+        # else {
+        # ignore, the FPs are overridden by other tests so it doesn't
+        # affect the overall results.
+        # }
       }
-
+ 
       # low number of matches overall
-      if ($spam < $LOW_MATCHES_PERCENT) 
+      if ($rule->{spam_percent} < $LOW_MATCHES_PERCENT)
                  { $badrule ||= ''; $badrule .= ', low matches'; }
-
-    } elsif ($score == 0.0) {
+ 
+    } elsif ($rule->{score} == 0.0) {
       $badrule = 'score is 0';
     }
   }
-
+ 
   if (defined $badrule) {
-    $badrule =~ s/^, //; chomp;
-    $output{$badrule} .= $_ . " ($badrule)\n";
+    $badrule =~ s/^, //;
+    $output{$badrule} .= $rule->{name} . " ($badrule)\n";
   }
 }
 
@@ -156,182 +258,3 @@
 exit;
 
 
-sub concat_rule_lang {
-  my $rule = shift;
-  my $lang = shift;
-
-  if (defined $lang && $lang ne '') {
-    return "[$lang]_$rule";
-  } else {
-    return $rule;
-  }
-}
-
-# note: do not use parse-rules-for-masses here, we need to do linting instead
-# of your average parse
-sub readrules {
-  my @files = <../rules/[0-9]*.cf>;
-  my $file;
-  %rulesfound = ();
-  %langs = ();
-  foreach $file (@files) {
-    open (IN, "<$file");
-    while (<IN>) {
-      s/#.*$//g; s/^\s+//; s/\s+$//; next if /^$/;
-
-      # make all the foo-bar stuff foo_bar
-      1 while s/^(\S+)-/\1_/g;
-      1 while s/^(lang\s+\S+\s+\S+)-/\1_/g;
-
-      my $lang = '';
-      if (s/^lang\s+(\S+)\s+//) {
-        $lang = $1; $langs{$1} = 1;
-      }
-
-      if (/^(header|rawbody|body|full|uri|meta)\s+(\S+)\s+/) {
-        $rulesfound{$2} = 1;
-        $rulefile{$2} ||= $file;
-        $scorefile{$1} = $file;
-        $score{$2} ||= 1.0;
-        $tflags{$2} ||= '';
-        $descfile{$2} ||= $file;       # a rule with no score or desc is OK
-	$description{$2}->{$lang} = undef;
-
-        if (/^body\s+\S+\s+eval:/) {
-          # ignored
-        } elsif (/^body\s+\S+\s+(.*)$/) {
-          my $re = $1;
-
-	  # If there's a ( in a rule where it should be (?:, flag it.
-	  # but ignore [abc(] ...
-          if ($re =~ /[^\\]\([^\?]/ && $re !~ /\[[^\]]*[^\\]\(/) { 
-            print "warning: non-(?:...) capture in regexp in $file: $_\n";
-          }
-          if ($re =~ /\.[\*\+]/) { 
-            print "warning: .* in regexp in $file: $_\n";
-          }
-          if ($re =~ /[^\\]\{(\d*),?(\d*?)\}/) {
-            if ($1 > 120 || $2 > 120) {
-              print "warning: long .{n} in regexp in $file: $_\n";
-            }
-          }
-        }
-
-      } elsif (/^describe\s+(\S+)\s+(.*?)\s*$/) {
-        $rulesfound{$1} = 1;
-        $descfile{concat_rule_lang ($1, $lang)} ||= $file;
-        $descfile{$1} ||= $file;
-	$description{$1}->{$lang} = $2;
-      } elsif (/^tflags\s+(\S+)\s+(.+)$/) {
-        $rulesfound{$1} = 1;
-        $tflags{$1} = $2;
-        $tflagsfile{concat_rule_lang ($1, $lang)} = $file;
-        $tflagsfile{$1} = $file;
-      } elsif (/^score\s+(\S+)\s+(.+)$/) {
-        $rulesfound{$1} = 1;
-        $scorefile{concat_rule_lang ($1, $lang)} = $file;
-        $scorefile{$1} = $file;
-        $score{$1} = $2;
-      } elsif (/^(clear_report_template|clear_spamtrap_template|report|spamtrap|
-                clear_terse_report_template|terse_report|
-                required_score|ok_locales|ok_languages|test|lang|
-                spamphrase|whitelist_from|require_version|
-		clear_unsafe_report_template|unsafe_report|
-		(?:bayes_)?auto_learn_threshold_nonspam|(?:bayes_)?auto_learn_threshold_spam|
-		(?:bayes_)?auto_learn
-                )/x) {
-        next;
-      } else {
-        print "warning: unknown rule in $file: $_\n";
-      }
-    }
-    close IN;
-  }
-  @langsfound = sort keys %langs;
-  @rulesfound = sort keys %rulesfound;
-}
-
-sub lintrules {
-  my %possible_renames = ();
-
-  foreach my $rule (@rulesfound) {
-    my $match = $rule;
-    $match =~ s/_\d+[^_]+$//gs;    # trim e.g. "_20K"
-    $match =~ s/[^A-Z]+//gs;    # trim numbers etc.
-
-    if (defined ($rulefile{$rule}) && $possible_renames{$match} !~ / \Q$rule\E\b/) {
-      $possible_renames{$match} .= " ".$rule;
-    }
-    $possible_rename_matches{$rule} = $match;
-  }
-
-  foreach my $lang ('', @langsfound) {
-    foreach my $baserule (@rulesfound) {
-      next if ( $baserule =~ /^__/ || $baserule =~ /^T_/ );
-
-      my $rule = concat_rule_lang ($baserule, $lang);
-      my $f = $descfile{$rule};
-      my $warned = '';
-
-      if (defined $f && !defined ($rulefile{$rule})
-                && !defined ($rulefile{$baserule}))
-      {
-        print "warning: $baserule has description, but no rule: $f\n";
-        $warned .= ' lamedesc';
-      }
-
-	# Check our convention for rule length
-	if ( (($lang ne '' && defined($rulefile{$rule})) || ($lang eq '' && defined ($rulefile{$baserule}))) && length $baserule > 22 ) {
-	  print "warning: $baserule has a name longer than 22 chars: $f\n";
-	}
- 	# Check our convention for rule length
-	if ( (($lang ne '' && defined($rulefile{$rule})) || ($lang eq '' && defined ($rulefile{$baserule}))) && defined $description{$baserule}->{$lang} && length $description{$baserule}->{$lang} > 50 ) {
-	  print "warning: $baserule has a description longer than 50 chars: $f\n";
-	}
-
-      # lang rule trumps normal rule
-      $f = $rulefile{$rule} || $rulefile{$baserule};
-      # if the rule exists, and the language/rule description doesn't exist ...
-      if ( defined $f && !defined $description{$baserule}->{$lang} )
-      {
-        print "warning: $baserule exists, ",( $lang ne '' ? "lang $lang, " : "" ),"but has no description: $f\n";
-        $warned .= ' lamedesc';
-      }
-
-
-      $f = $scorefile{$rule};
-      if (defined $f && !defined ($rulefile{$rule})
-                && !defined ($rulefile{$baserule}))
-      {
-        print "warning: $baserule has score, but no rule: $f\n";
-        $warned .= ' lamescore';
-      }
-
-      my $r = $possible_rename_matches{$rule};
-      if ($warned ne '' && defined $r) {
-        my @matches = split (' ', $possible_renames{$r});
-        if (scalar @matches != 0) {
-          my $text = '';
-
-          # now try and figure out "nearby" rules with no description/score
-          foreach my $baser (@matches) {
-            my $blang;
-            if ($descfile{$rule} =~ /text_(\S\S)\./) {
-              $blang = $1;
-            }
-            my $r = concat_rule_lang ($baser, $blang);
-            #warn "$r $descfile{$r} $descfile{$baser}";
-            next if ($warned =~ /lamedesc/ && (defined $descfile{$r}));
-            next if ($warned =~ /lamescore/ && (defined $scorefile{$r}));
-            $text .= " $baser";
-          }
-
-          if ($text ne '') {
-            print "warning: (possible renamed rule? $text)\n";
-          }
-        }
-      }
-    }
-  }
-}
-
Index: mass-check
===================================================================
--- mass-check	(revision 10703)
+++ mass-check	(working copy)
@@ -16,97 +16,151 @@
 # limitations under the License.
 # </@LICENSE>
 
-sub usage {
-  die <<ENDOFUSAGE;
-usage: mass-check [options] target ...
- 
-  -c=file       set configuration/rules directory
-  -p=dir        set user-prefs directory
-  -f=file       read list of targets from <file>
-  -j=jobs       specify the number of processes to run simultaneously
-  --net         turn on network checks!
-  --mid         report Message-ID from each message
-  --debug       report debugging information
-  --progress    show progress updates during check
-  --rewrite=OUT save rewritten message to OUT (default is /tmp/out)
-  --showdots    print a dot for each scanned message
-  --rules=RE    Only test rules matching the given regexp RE
-  --restart=N   restart all of the children after processing N messages
-  --deencap=RE  Extract SpamAssassin-encapsulated spam mails only if they
-                were encapsulated by servers matching the regexp RE
-                (default = extract all SpamAssassin-encapsulated mails)
- 
-  log options
-  -o            write all logs to stdout
-  --loghits     log the text hit for patterns (useful for debugging)
-  --loguris	log the URIs found
-  --hamlog=log  use <log> as ham log ('ham.log' is default)
-  --spamlog=log use <log> as spam log ('spam.log' is default)
- 
-  message selection options
-  -n            no date sorting or spam/ham interleaving
-  --after=N     only test mails received after time_t N (negative values
-                are an offset from current time, e.g. -86400 = last day)
-                or after date as parsed by Time::ParseDate (e.g. '-6 months')
-  --before=N    same as --after, except received times are before time_t N
-  --all         don't skip big messages
-  --head=N      only check first N ham and N spam (N messages if -n used)
-  --tail=N      only check last N ham and N spam (N messages if -n used)
- 
-  simple target options (implies -o and no ham/spam classification)
-  --dir         subsequent targets are directories
-  --file        subsequent targets are files in RFC 822 format
-  --mbox        subsequent targets are mbox files
-  --mbx         subsequent targets are mbx files
- 
-  Just left over functions we should remove at some point:
-  --bayes       report score from Bayesian classifier
- 
-  non-option arguments are used as target names (mail files and folders),
-  the target format is: <class>:<format>:<location>
-  <class>       is "spam" or "ham"
-  <format>      is "dir", "file", "mbx", or "mbox"
-  <location>    is a file or directory name.  globbing of ~ and * is supported
+=head1 NAME
 
-ENDOFUSAGE
-}
+mass-check - Generates SpamAssassin scores and results for large
+amounts of mail
 
+=head1 SYNOPSIS
+
+ mass-check [options] class:format:location ...
+ mass-check [options] {--dir | --file | --mbox} target ...
+ mass-check [options] -f file
+
+  Options:
+    -f=file       read list of targets from <file>
+    -j=jobs       specify the number of processes to run simultaneously
+    --net         turn on network checks!
+    --mid         report Message-ID from each message
+    --debug       report debugging information
+    --progress    show progress updates during check
+    --rewrite=OUT save rewritten message to OUT (default is /tmp/out)
+    --showdots    print a dot for each scanned message
+    --rules=RE    Only test rules matching the given regexp RE
+    --restart=N   restart all of the children after processing N messages
+
+    SpamAssassin options
+    -c=dir        set configuration/rules directory
+    -p=file       set user preferences file (default: none)
+    -s=dir        set site rules configuration directory
+    -u=dir        set user-state directory
+    --dist        assumes the script is being run from the masses/ dir of
+                  the unpacked tarball, and makes appropriate guesses for
+                  -p and -c
+    --deencap=RE  Extract SpamAssassin-encapsulated spam mails only if they
+                  were encapsulated by servers matching the regexp RE
+                  (default = extract all SpamAssassin-encapsulated mails)
+
+    log options
+    -o            write all logs to stdout
+    --loghits     log the text hit for patterns (useful for debugging)
+    --loguris	  log the URIs found
+    --log=file    log to <file> (masses.log is default)
+
+    message selection options
+    -n            no date sorting or spam/ham interleaving
+    --after=N     only test mails received after time_t N (negative values
+                  are an offset from current time, e.g. -86400 = last day)
+                  or after date as parsed by Time::ParseDate (e.g. '-6 months')
+    --before=N    same as --after, except received times are before time_t N
+    --all         don't skip big messages
+    --head=N      only check first N ham and N spam (N messages if -n used)
+    --tail=N      only check last N ham and N spam (N messages if -n used)
+
+    simple target options (implies -o and no ham/spam classification)
+    --dir         subsequent targets are directories
+    --file        subsequent targets are files in RFC 822 format
+    --mbox        subsequent targets are mbox files
+    --mbx         subsequent targets are mbx files
+
+    Just left over functions we should remove at some point:
+    --bayes       report score from Bayesian classifier
+    --hamlog=log  use <log> as ham log ('ham.log' is default)
+    --spamlog=log use <log> as spam log ('spam.log' is default)
+
+=head1 DESCRIPTION
+
+B<mass-check> is designed to assist with rule development and
+generation of SpamAssassin scored. It reads in mail from the
+location(s) specified on the command line (in the first form above),
+given in the form I<class:format:location>, where I<class> is either
+"spam" or "ham" (non-spam), I<format> is one of "dir" (Maildirs, MH,
+etc), "file", "mbox" (mboxes can be gzipped) or "mbx".
+
+B<mass-check> will analyze each message using SpamAssassin and
+generate one-line of output per message, (by default to masses.log) in
+the following format:
+
+ {s|h} {s|h} score filename tests-hit
+
+The first field is the message's class as given on the command line
+(ham or spam). The second is the message's class as determined by
+SpamAssassin. The third is the message's score, as determined by
+SpamAssassin. The fourth field contains the message's filename; for
+mboxes, this contains the filename and the byte offset from the
+beginning of the file separated by a period. The last field contains a
+list of all the tests the message hit separated by commas.
+
+If you want to run this on the currently installed version of
+SpamAssassin's rules for sitewide use, make sure your user_prefs file
+contains no rules.
+
+=head1 BUGS
+
+Please report bugs to http://bugzilla.spamassassin.org/
+
+=head1 SEE ALSO
+
+L<hit-frequencies(1)>, L<logs-to-c(1)>, L<Mail::SpamAssassin::Masses(3)>,
+L<perceptron(1)>
+
+=cut
+
 ###########################################################################
 
-use vars qw($opt_c $opt_p $opt_f $opt_j $opt_n $opt_o $opt_all $opt_bayes
-	    $opt_debug $opt_format $opt_hamlog $opt_head $opt_loghits
-	    $opt_mid $opt_mh $opt_ms $opt_net $opt_nosort $opt_progress
-	    $opt_showdots $opt_spamlog $opt_tail $opt_rules $opt_restart
-	    $opt_loguris $opt_after $opt_before $opt_rewrite $opt_deencap);
+use vars qw($opt_c $opt_p $opt_f $opt_j $opt_n $opt_o $opt_all
+	    $opt_bayes $opt_before $opt_debug $opt_dist $opt_format
+	    $opt_hamlog $opt_head $opt_log $opt_loghits $opt_mid
+	    $opt_mh $opt_ms $opt_net $opt_nosort $opt_p $opt_progress
+	    $opt_s $opt_showdots $opt_spamlog $opt_tail $opt_rules
+	    $opt_restart $opt_loguris $opt_after $opt_rewrite $opt_u
+	    $opt_deencap);
 
 use FindBin;
 use lib "$FindBin::Bin/../lib";
 eval "use bytes";
 use Mail::SpamAssassin::ArchiveIterator;
 use Mail::SpamAssassin;
-use Getopt::Long;
+use Getopt::Long qw(:config bundling auto_help);
 use POSIX qw(strftime);
 use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; };
+use strict; # Why wasn't this on?
 use Config;
 
 # default settings
-$opt_c = "$FindBin::Bin/../rules";
-$opt_p = "$FindBin::Bin/spamassassin";
+
 $opt_j = 1;
 $opt_net = 0;
-$opt_hamlog = "ham.log";
-$opt_spamlog = "spam.log";
+$opt_log = "masses.log";
 
-GetOptions("c=s", "p=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug",
-	   "hamlog=s", "head=i", "loghits", "mh", "mid", "ms", "net",
-	   "progress", "rewrite:s", "showdots", "spamlog=s", "tail=i",
-	   "rules=s", "restart=i", "after=s", "before=s", "loguris", "deencap=s",
+GetOptions("c=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug",
+	   "deencap=s", "dist!", "hamlog=s", "head=i", "log=s",
+	   "loghits", "mh", "mid", "ms", "net", "p=s", "progress",
+	   "rewrite:s", "s=s", "showdots", "spamlog=s", "tail=i",
+	   "rules=s", "restart=i", "u=s", "after=s", "loguris",
 	   "dir" => sub { $opt_format = "dir"; },
-	   "file" => sub { $opt_format = "file"; },
+	   "file" => sub {$opt_format = "file"; },
 	   "mbox" => sub { $opt_format = "mbox"; },
 	   "mbx" => sub { $opt_format = "mbx"; },
 	   '<>' => \&target) or usage();
 
+if ($opt_hamlog || $opt_spamlog) { # Old style logging
+  $opt_hamlog ||= "ham.log";
+  $opt_spamlog ||= "spam.log";
+}
+
+my @targets;
+
 if ($opt_f) {
   open(F, $opt_f) || die $!;
   push(@targets, map { chomp; $_ } <F>);
@@ -115,45 +169,89 @@
 
 if (scalar @targets == 0) { usage(); }
 
-#if ($opt_ms) {
-#find_missed($opt_spamlog);
-#}
-#elsif ($opt_mh) {
-#find_missed($opt_hamlog);
-#}
+# Auto-detect --dist option
+if (!defined $opt_dist) {
+  if (-f "$FindBin::Bin/../spamassassin.raw") {
+    warn "Automatically using --dist. Assuming you are running from the unpacked tarball. Use --no-dist to override.";
+    $opt_dist = 1;
+  }
+}
 
-$spamtest = new Mail::SpamAssassin ({
-  'debug'              			=> $opt_debug,
-  'rules_filename'     			=> $opt_c,
-  'userprefs_filename' 			=> "$opt_p/user_prefs",
-  'site_rules_filename'			=> "$opt_p/local.cf",
-  'userstate_dir'     			=> "$opt_p",
-  'save_pattern_hits'  			=> $opt_loghits,
-  'dont_copy_prefs'   			=> 1,
-  'local_tests_only'   			=> $opt_net ? 0 : 1,
-  'only_these_rules'   			=> $opt_rules,
-  'ignore_safety_expire_timeout'	=> 1,
-  PREFIX				=> '',
-  DEF_RULES_DIR        			=> $opt_c,
-  LOCAL_RULES_DIR      			=> '',
-});
+my $local_rules_dir;
 
+if ($opt_dist) { # Set defaults
+  $opt_c ||= "$FindBin::Bin/../rules";
+  $opt_p ||= "$FindBin::Bin/mass-check.cf";
+  $opt_u ||= "$FindBin::Bin/spamassassin";
+  $opt_s ||= "$FindBin::Bin/spamassassin";
+  $local_rules_dir = '';
+}
+else {
+  if(!$opt_u) {
+    # Assuming this is OK, since mass-check isnt supported on windows, is it?
+    # Also, should there be some check to make sure that previous mass-check stuff isn't in there?
+    # AFAICT, there isn't otherwise....
+    if ( -d "${ENV{HOME}}/.spamassassin" ) {
+      $opt_u = "${ENV{HOME}}/.spamassassin/mass-check";
+      mkdir $opt_u, 0700 if (! -d $opt_u);
+    }
+  }
+
+# Leave the rest to SA, we'll get it afterwards
+
+}
+
+my $spamtest = new Mail::SpamAssassin ({
+				       'debug'              			=> $opt_debug,
+				       'rules_filename'     			=> $opt_c,
+				       'userprefs_filename' 			=> $opt_p,
+				       'site_rules_filename'			=> $opt_s,
+				       'userstate_dir'     			=> $opt_u,
+				       'save_pattern_hits'  			=> $opt_loghits,
+				       'dont_copy_prefs'   			=> 1,
+				       'local_tests_only'   			=> $opt_net ? 0 : 1,
+				       'only_these_rules'   			=> $opt_rules,
+				       'ignore_safety_expire_timeout'	=> 1,
+				       DEF_RULES_DIR        			=> $opt_c,
+				       LOCAL_RULES_DIR      			=> $local_rules_dir,
+				      });
+
 $spamtest->compile_now(1);
-$spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf");
+if ($opt_dist) {
+  $spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf");
+}
 
 my $who   = `id -un 2>/dev/null`;   chomp $who;
 my $where = `uname -n 2>/dev/null`; chomp $where;
 my $when  = `date -u`;              chomp $when;
-my $revision = "unknown";
-if (open(TESTING, "$opt_c/70_testing.cf")) {
-  chomp($revision = <TESTING>);
-  $revision =~ s/.*\$Rev:\s*(\S+).*/$1/;
-  close(TESTING);
+my $revision;
+
+if ($opt_dist) {
+  my $rev = "unknown";
+  if (open(TESTING, "$opt_c/70_testing.cf")) {
+    chomp($rev = <TESTING>);
+    $rev =~ s/.*\$Rev:\s*(\S+).*/$1/;
+    close(TESTING);
+  }
+  $revision = "SVN revision: $rev";
 }
+else {
+  $revision = "Local";
+}
+
 my $log_header = "# mass-check results from $who\@$where, on $when\n" .
 		 "# M:SA version ".$spamtest->Version()."\n" .
-		 "# SVN revision: $revision\n" .
+		 "# $revision\n" .
 		 "# Perl version: $] on $Config{archname}\n";
+
+if (!$opt_dist) {
+  my @paths = ( $spamtest->{rules_filename}, $spamtest->{site_rules_filename}, $spamtest->{userprefs_filename} );
+  $log_header .= "# Using configuration:\n";
+  foreach my $file (@paths) {
+    $log_header .=  "# $file\n";
+  }
+}
+
 my $host = $ENV{'HOSTNAME'} || $ENV{'HOST'} || `hostname` || 'localhost';
 chomp $host;
 
@@ -222,7 +320,7 @@
     autoflush STDOUT 1;
     print STDOUT $log_header;
   }
-  else {
+  elsif ($opt_hamlog || $opt_spamlog) {
     open(HAM, "> $opt_hamlog");
     open(SPAM, "> $opt_spamlog");
     autoflush HAM 1;
@@ -230,6 +328,11 @@
     print HAM $log_header;
     print SPAM $log_header;
   }
+  else {
+    open(OUT, "> $opt_log");
+    autoflush OUT 1;
+    print OUT $log_header;
+  }
   $init_results = 1;
 }
 
@@ -239,25 +342,36 @@
   # don't open results files until we get here to avoid overwriting files
   &init_results if !$init_results;
 
-  if ($class eq "s") {
-    if ($opt_o) { print STDOUT $result; } else { print SPAM $result; }
-    $spam_count++;
+  if ($opt_o) {
+    print STDOUT $result;
   }
-  elsif ($class eq "h") {
-    if ($opt_o) { print STDOUT $result; } else { print HAM $result; }
-    $ham_count++;
+  elsif ($opt_spamlog || $opt_hamlog) {
+    if ($class eq "s") {
+      print SPAM $result;
+    } else {
+      print HAM $result;
+    }
   }
+  else {
+    print OUT $result;
+  }
 
   $total_count++;
 #warn ">> result: $total_count $class $time\n";
 
   if ($opt_progress) {
+    if ($class eq "s") {
+      $spam_count++;
+    }
+    else {
+      $ham_count++;
+    }
     progress($time);
   }
 }
 
 sub wanted {
-  my (undef, $id, $time, $dataref) = @_;
+  my ($class, $id, $time, $dataref) = @_;
   my $out;
 
   my $ma = $spamtest->parse($dataref, 1);
@@ -308,18 +422,22 @@
     push(@extra, "mid=$mid");
   }
 
-  my $yorn;
+  my $result;
   my $score;
   my $tests;
   my $extra;
 
   if ($opt_loguris) {
-    $yorn = '.';
+    $result = '.';
     $score = 0;
     $tests = join(" ", sort @uris);
     $extra = '';
   } else {
-    $yorn = $status->is_spam() ? 'Y' : '.';
+    if ($status->is_spam()) {
+      $result = "s";
+    } else {
+      $result = "h";
+    }
     $score = $status->get_score();
     $tests = join(",", sort(grep(length,$status->get_names_of_tests_hit(),$status->get_names_of_subtests_hit())));
     $extra = join(",", @extra);
@@ -333,7 +451,7 @@
 
   $id =~ s/\s/_/g;
 
-  $out .= sprintf("%s %2d %s %s %s\n", $yorn, $score, $id, $tests, $extra);
+  $out .= sprintf("%s %s %05.2f %s %s %s\n", $class, $result, $score, $id, $tests, $extra);
 
   if ($tests =~ /MICROSOFT_EXECUTABLE|MIME_SUSPECT_NAME/) {
     $out .= logkilled($ma, $id, "possible virus");
Index: mk-baseline-results
===================================================================
--- mk-baseline-results	(revision 10703)
+++ mk-baseline-results	(working copy)
@@ -10,7 +10,7 @@
 echo "Classification success on test corpora, at default threshold:"
 echo
 
-./logs-to-c --spam=spam-validate.log --nonspam=nonspam-validate.log --threshold 5 --count --scoreset=$SCORESET | sed -e 's/^Reading.*//' -e '/^$/d'
+./fp-fn-statistics --logfile=masses-validate.log --threshold 5 --scoreset=$SCORESET | sed -e 's/^Reading.*//' -e '/^$/d'
 
 echo
 echo "Results on test corpora at various alternative thresholds:"
@@ -18,7 +18,7 @@
 
 # list a wide range of thresholds, so that we can make graphs later ;)
 for thresh in -4 -3 -2 -1 0 1 2 3 4 4.5 5.5 6 6.5 7 8 9 10 12 15 17 20 ; do
-  ./logs-to-c --spam=spam-validate.log --nonspam=nonspam-validate.log --threshold $thresh --count --scoreset=$SCORESET | sed -e 's/^Reading.*//' -e '/^$/d'
+  ./logs-to-c --logfile=masses-validate.log --threshold $thresh --scoreset=$SCORESET | sed -e 's/^Reading.*//' -e '/^$/d'
   echo
 done
 
Index: README
===================================================================
--- README	(revision 10703)
+++ README	(working copy)
@@ -52,11 +52,11 @@
   This script is used to perform "mass checks" of a set of mailboxes, Cyrus
   folders, and/or MH mail spools.  It generates summary lines like this:
 
-  Y  7 /home/jm/Mail/Sapm/1382 SUBJ_ALL_CAPS,SUPERLONG_LINE,SUBJ_FULL_OF_8BITS
+  s s 07.22 /home/jm/Mail/Sapm/1382 SUBJ_ALL_CAPS,SUPERLONG_LINE,SUBJ_FULL_OF_8BITS
 
   or for mailboxes,
 
-  .  1 /path/to/mbox:<5.1.0.14.2.20011004073932.05f4fd28@localhost> TRACKER_ID,BALANCE_FOR_LONG
+  h h 01.32 /path/to/mbox:<5.1.0.14.2.20011004073932.05f4fd28@localhost> TRACKER_ID,BALANCE_FOR_LONG
 
   listing the path to the message or its message ID, its score, and the tests
   that triggered on that mail.
@@ -65,23 +65,24 @@
   get good hits with few false positives, etc., and re-score the tests to
   optimise the ratio.
 
-  This script relies on the spamassassin distribution directory living in "..".
+  If given the --dist option, this script relies on the spamassassin
+  distribution directory living in "..". If this script is not in the
+  distribution directory, it will generate logs based on the site-wide
+  rules, as well as personal rules.
 
 
 logs-to-c :
 
-  Takes the "spam.log" and "nonspam.log" files and converts them into C
-  source files and simplified data files for use by the C score optimization
-  algorithm.  (Called by "make" when you build the perceptron, so generally
-  you won't need to run it yourself.)
+  Takes the "masses.log" file and converts them into C source files
+  and simplified data files for use by the C score optimization
+  algorithm.  (Called by "make" when you build the perceptron, so
+  generally you won't need to run it yourself.)
 
-
 hit-frequencies :
 
   Analyses the log files and computes how often each test hits, overall,
   for spam mails and for non-spam.
 
-
 mk-baseline-results :
 
   Compute results for the baseline scores (read from ../rules/*).  If you
@@ -91,7 +92,6 @@
   It will output statistics on the current ruleset to ../rules/STATISTICS.txt,
   suitable for a release build of SpamAssassin.
 
-
 perceptron.c :
 
   Perceptron learner by Henry Stern.  See "README.perceptron" for details.
Index: fp-fn-statistics
===================================================================
--- fp-fn-statistics	(revision 10703)
+++ fp-fn-statistics	(working copy)
@@ -1,3 +1,146 @@
-#!/bin/sh
+#!/usr/bin/perl -w
+#
+# <@LICENSE>
+# Copyright 2004 Apache Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+ 
+ 
+use FindBin;
+use lib "$FindBin::Bin/../lib";
+use Mail::SpamAssassin::Masses;
+use Getopt::Long;
+use strict;
+use warnings;
 
-exec ./logs-to-c --count $*
+use vars qw{$opt_c $opt_l $opt_s $opt_h $opt_t $opt_lambda};
+
+GetOptions("c|cffile=s@" => \$opt_c, "l|logfile=s" => \$opt_l,
+"s|scoreset=i" => \$opt_s, "h|help" => \$opt_h, "t|threshold" =>
+\$opt_t, "lambda" => \$opt_lambda);
+
+
+# Need to add rule for scores file 
+sub usage {
+  die "logs-to-c [-c rules dir] [-l log file] [-s SS] [-t]
+
+    -c,--cffile   p   use p as rules directory
+    -s,--scoreset SS  use scoreset SS
+    -l,--log      log read logs from log instead of from masses.log
+    -t,--threshold n  use a threshold of n (default: 5)
+    --lambda=n        use a lambda value of n (default: 5)
+";
+}
+
+
+usage() if $opt_h;
+
+$opt_c ||= "$FindBin::Bin/../rules";
+$opt_t = (defined($opt_t) ? $opt_t : 5);
+$opt_s ||= 0; #|
+$opt_l ||= "masses.log";
+$opt_lambda ||= 5;
+
+print "$opt_c used for config";
+
+my $nybias = 10;
+
+
+my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
+                                               scoreset => $opt_s, # ,,
+                                               logfile => $opt_l});
+
+$masses->readlogs();
+
+my $logs = $masses->get_logs();
+
+my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore) = (0,0,0,0,0,0,0,0);
+
+my $num_spam = $masses->get_num_spam();
+my $num_ham = $masses->get_num_ham();
+my $num_logs = $num_spam + $num_ham;
+
+my $count = 0;
+
+my $score;
+
+foreach my $log (@$logs) {
+
+  $score = 0;
+  foreach my $test (@{$log->{tests_hit}}) {
+
+    next if ($test->{issubrule});
+    next if (!$test->{score});
+
+    $score += $test->{score};
+
+  }
+
+  if ($score >= $opt_t) {
+    if ($log->{isspam}) {
+      $ga_yy++;
+      $yyscore += $score;
+    }
+    else {
+      $ga_ny++;
+      $nyscore += $score;
+    }
+  } else {
+    if ($log->{isspam}) {
+      $ga_yn++;
+      $ynscore += $score;
+    }
+    else {
+      $ga_nn++;
+      $nnscore += $score;
+    }
+  }
+}
+
+$nybias = $nybias * ($num_spam / $num_ham);
+
+my $fprate = ($ga_ny / $num_logs) * 100.0;
+my $fnrate = ($ga_yn / $num_logs) * 100.0;
+
+printf ("\n# SUMMARY for threshold %3.1f:\n", $opt_t);
+printf "# Correctly non-spam: %6d  %4.2f%%  (%4.2f%% of non-spam corpus)\n", $ga_nn,
+  ($ga_nn /  $num_logs) * 100.0, ($ga_nn /  $num_ham) * 100.0;
+printf "# Correctly spam:     %6d  %4.2f%%  (%4.2f%% of spam corpus)\n" , $ga_yy,
+  ($ga_yy /  $num_logs) * 100.0, ($ga_yy /  $num_spam) * 100.0;
+printf "# False positives:    %6d  %4.2f%%  (%4.2f%% of nonspam, %6.0f weighted)\n", $ga_ny,
+  $fprate, ($ga_ny /  $num_ham) * 100.0, $nyscore*$nybias;
+printf "# False negatives:    %6d  %4.2f%%  (%4.2f%% of spam, %6.0f weighted)\n", $ga_yn,
+  $fnrate, ($ga_yn /  $num_spam) * 100.0, $ynscore;
+
+# convert to the TCR metrics used in the published lit
+my $nspamspam = $ga_yy;
+my $nspamlegit = $ga_yn;
+my $nlegitspam = $ga_ny;
+my $nlegitlegit = $ga_yn;
+my $nlegit = $num_ham;
+my $nspam = $num_spam;
+
+my $werr = ($opt_lambda * $nlegitspam + $nspamlegit)
+  / ($opt_lambda * $nlegit + $nspam);
+
+my $werr_base = $nspam
+  / ($opt_lambda * $nlegit + $nspam);
+
+$werr ||= 0.000001;     # avoid / by 0
+my $tcr = $werr_base / $werr;
+
+my $sr = ($nspamspam / $nspam) * 100.0;
+my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;
+printf "# TCR: %3.6f  SpamRecall: %3.3f%%  SpamPrec: %3.3f%%  FP: %3.2f%%  FN: %3.2f%%\n", $tcr, $sr, $sp, $fprate, $fnrate;
+
Index: extract-message-from-mbox
===================================================================
--- extract-message-from-mbox	(revision 10703)
+++ extract-message-from-mbox	(working copy)
@@ -68,7 +68,7 @@
 
 sub masscheck {
   while (<STDIN>) {
-    my $mail = (split(/\s+/, $_))[2];
+    my $mail = (split(/\s+/, $_))[3];
     $mail =~ tr/_/ /;
     if ($mail =~ /^(.*)\.(\d+)$/) {
       extract($1, $2);
Index: logs-to-c
===================================================================
--- logs-to-c	(revision 10703)
+++ logs-to-c	(working copy)
@@ -16,257 +16,243 @@
 # limitations under the License.
 # </@LICENSE>
 
-use Getopt::Long;
-use vars qw($opt_cffile $opt_count $opt_lambda $opt_threshold
-		$opt_spam $opt_nonspam);
+=head1 NAME
 
-GetOptions("cffile=s", "count", "lambda=f", "threshold=f", "spam=s", "nonspam=s", "scoreset=i");
-my $argcffile = $opt_cffile;
+logs-to-c - Convert a mass-check log into perceptron format
 
-my $justcount = 0;
-if ($opt_count) { $justcount = 1; }
+=head1 SYNOPSIS
 
-my $threshold = 5;
-if (defined $opt_threshold) { $threshold = $opt_threshold; }
+logs-to-c [options]
 
-$opt_spam ||= 'spam.log';
-$opt_nonspam ||= 'ham.log';
-$opt_scoreset = 0 if ( !defined $opt_scoreset );
+ Options:
+    -c,--cffile=path	  Use path as the rules directory
+    -s,--scoreset=n	  Use scoreset n
+    -l,--logfile=file	  Read in file instead of masses.log
+    -o,--outputdir        Put output in the specified dir (default tmp/)
 
-my $nybias = 10;
+=head1 DESCRIPTION
 
-# lambda value for TCR equation, indicating the "cost" of recovering
-# from an FP.  The values are: 1 = tagged only, 9 = mailed back to
-# sender asking for token (TMDA style), 999 = deleted outright.
-# We (SpamAssassin) use a default of 5, representing "moved to
-# infrequently-read folder".
+B<logs-to-c> will read the mass-check log F<masses.log> or as
+specified by the B<--logfile> option, and convert it into the format
+needed by the perceptron. This is a format that is simple for the
+perceptron to parse, but is not very readable to humans.
 
-my $lambda = 5;
-if ($opt_lambda) { $lambda = $opt_lambda; }
+By default, output will be put in the directory ./tmp/ unless another
+directory is specified by the B<--outputdir> option. (Note: at the
+current time, this must be /tmp/ in order for the perceptron to
+compile properly.)
 
-my %is_spam = ();
-my %tests_hit = ();
-my %mutable_tests = ();
+=head1 BUGS
 
-use vars qw(%rules %allrules);
+Please report bugs to http://bugzilla.spamassassin.org/
 
-readscores();
+=head1 SEE ALSO
 
-print "Reading per-message hit stat logs and scores...\n";
-my ($num_tests, $num_spam, $num_nonspam);
-my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore);
+L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>
 
-readlogs();
-read_ranges();
+=cut
 
-if ($justcount) {
-  $nybias = $nybias*($num_spam / $num_nonspam);
-  evaluate();
-} else {
-  print "Writing logs and current scores as C code...\n";
-  writescores_c();
+use FindBin;
+use lib "$FindBin::Bin/../lib";
+use Mail::SpamAssassin::Masses;
+use Getopt::Long qw(:config bundling auto_help);
+use strict;
+use warnings;
+
+use vars qw{$opt_c $opt_l $opt_s $opt_o};
+
+GetOptions("c|cffile=s@" => \$opt_c,
+	   "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
+	   "l|logfile=s" => \$opt_l,
+	   "o|output=s" => \$opt_o);
+
+
+$opt_o ||= "./tmp/";
+if (!-d $opt_o) {
+  mkdir $opt_o, 0777 or die "Can't mkdir $opt_o";
 }
-exit 0;
 
+if (!$opt_c || !scalar(@$opt_c)) {
+    # Try to read this in from the log, if possible
+    open IN, $opt_l or die "Can't open $opt_l: $!";
+    my $files = 0; # are we in the files section?
+    while(<IN>) {
+	if (!$files) {
+	    if (/^\# SVN revision:/) {
+		$opt_c = [ "$FindBin::Bin/../rules" ];
+		last;
+	    } elsif (/^\# Using configuration:$/) {
+		$files = 1;
+	    }
+	} elsif (/^\#\s+(.*)\s*$/) {
+	    push (@$opt_c, $1);
+	} else {
+	    # All done!
+	    last;
+	}
+    }
 
-sub readlogs {
-  my $count = 0;
-  $num_spam = $num_nonspam = 0;
+    foreach my $file (@$opt_c) {
+	die "Can't read $file" unless -r $file;
+    }
+}
 
-  if ($justcount) {
-    $ga_yy = $ga_ny = $ga_yn = $ga_nn = 0;
-    $yyscore = $ynscore = $nyscore = $nnscore = 0.0;
-  }
+# ignore rules that are subrules -- we don't generate scores for them...
 
-  foreach my $file ($opt_spam, $opt_nonspam) {
-    open (IN, "<$file");
+# Note: this will cause a difference over the old logs-to-c since rank
+# is dependent on the frequencies of all rules, not just non-subrules
 
-    while (<IN>) {
-      next if /^\#/;
-      next if /^$/;
-      if($_ !~ /^.\s+([-\d]+)\s+\S+\s*/) { warn "bad line: $_"; next; }
-      my $hits = $1;
-#my $foo = $_;
-      $_ = $'; s/(?:bayes|time)=\S+//; s/,,+/,/g; s/^\s+//; s/\s+$//;
+my $greprules = sub { return 0 if $_[1]->{issubrule}; return 1; };
 
-      my $score = 0;
-      my @tests = ();
-      foreach my $tst (split (/,/, $_)) {
-	next if ($tst eq '');
-	if (!defined $scores{$tst}) {
-          #warn "unknown test in $file, ignored: $tst\n";
-	  next;
-	}
+$opt_s ||= 0; # |
 
-	# Make sure to skip any subrules!
-	next if ( $allrules{$tst}->{issubrule} );
+my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
+					       scoreset => $opt_s, # ,,
+					       logfile => $opt_l,
+                                               greprules => $greprules });
 
-        if ($justcount) {
-          $score += $scores{$tst};
-        } else {
-          push (@tests, $tst);
-        }
-      }
+$masses->readlogs();
+$masses->do_score_ranges();
 
-      if (!$justcount) { 
-        $tests_hit{$count} = \@tests;
-      }
+my $rules = $masses->get_rules_array();
+my $logs = $masses->get_logs();
 
-      if ($file eq $opt_spam) {
-	$num_spam++;
-        if ($justcount) {
-          if ($score >= $threshold) {
-            $ga_yy++; $yyscore += $score;
-          } else {
-            $ga_yn++; $ynscore += $score;
-          }
-        } else {
-          $is_spam{$count} = 1;
-        }
-      } else {
-	$num_nonspam++;
-        if ($justcount) {
-          if ($score >= $threshold) {
-#print "$score -- $foo";
-            $ga_ny++; $nyscore += $score;
-          } else {
-            $ga_nn++; $nnscore += $score;
-          }
-        } else {
-          $is_spam{$count} = 0;
-        }
-      }
-      $count++;
-    }
-    close IN;
-  }
-  $num_tests = $count;
-}
+my @index_to_rule;
+my $num_spam = $masses->get_num_spam();
+my $num_ham = $masses->get_num_ham();
 
+# This is misleading -- num_tests is really num_msgs
+my $num_tests = $num_spam + $num_ham;
 
-sub readscores {
-  if (!defined $argcffile) { $argcffile = "../rules"; }
-  print "Reading scores from \"$argcffile\"...\n";
-  system ("./parse-rules-for-masses -d \"$argcffile\" -s $opt_scoreset") and die;
-  require "./tmp/rules.pl";
-  %allrules = %rules;           # ensure it stays global
-}
 
+# Write logs and scores as C code
+writescores_c();
+writetests_c();
+
+
 sub writescores_c {
-  my $output = '';
-  my $size = 0;
+
   my $mutable = 0;
-  my $i;
+  my $output = '';
+  my $count = 0;
+  my $score = 0;
 
-    # jm: now, score-ranges-from-freqs has tflags to work from, so
-    # it will always list all mutable tests.
+  foreach my $rule (sort {($b->{ismutable} <=> $a->{ismutable}) ||
+			  ($a->{name} cmp $b->{name}) } @$rules) {
 
-  @index_to_rule = sort {($ignored_rule{$a} <=> $ignored_rule{$b}) ||
-			  ($mutable_tests{$b} <=> $mutable_tests{$a}) ||
-			   ($a cmp $b)} (keys %scores);
-  my $max_hits_per_msg = 0;
-  for ($file = 0; $file < $num_tests; $file++) {
-    my(@hits) =
-     grep {(! $ignored_rule{$_}) && $mutable_tests{$_}} (@{$tests_hit{$file}});
-    if ((scalar(@hits)+1) > $max_hits_per_msg) {
-      $max_hits_per_msg = scalar(@hits)+1;
-    }
-  }
+    $score = $rule->{score};
 
-  for ($i = 0; $i <= $#index_to_rule; $i++) {
-    my $name = $index_to_rule[$i];
-    $rule_to_index{$name} = $i;
+    # ignored rules (i.e. no scores)
+    next unless $score;
 
-    if ($ignored_rule{$name}) { next; }
+    # also ignore rules with score range 0
+    next if (!$rule->{range_lo} && !$rule->{range_hi});
 
-    if ($mutable_tests{$name} == 0) {
-      $range_lo{$name} = $range_hi{$name} = $scores{$name};
-    } else {
+    # Set an index
+    $rule->{index} = $count;
+    $index_to_rule[$count] = $rule; # add the reference to the array
+
+    if ($rule->{ismutable}) {
       $mutable++;
-      if ($range_lo{$name} > $range_hi{$name}) {
-	($range_lo{$name},$range_hi{$name}) =
-	 ($range_hi{$name},$range_lo{$name});
+      if ($score > $rule->{range_hi}) {
+	$score = $rule->{range_hi} - 0.001;
+      } elsif ($score < $rule->{range_lo}) {
+	$score = $rule->{range_lo} + 0.001;
       }
-      #$range_lo{$name} ||= 0.1;
-      #$range_hi{$name} ||= 1.5;
     }
+    # These should all be set properly if not mutable
+    # score = range_lo = range_hi
+    else {
+      warn "hi != lo for " . $rule->{name} . "!" if $rule->{range_lo} != $rule->{range_hi};
+      $score = $rule->{range_hi} = $rule->{range_lo};
+    }
 
-    $output .= ".".$i."\n".
-                "n".$name."\n".
-                "b".$scores{$name}."\n".
-                "m".$mutable_tests{$name}."\n".
-                "l".$range_lo{$name}."\n".
-                "h".$range_hi{$name}."\n";
-    $size++;
+    $output .= "." . $count . "\n" .
+         "n" . $rule->{name} . "\n" .
+	 "b" . $score . "\n" .
+	 "m" . $rule->{ismutable} . "\n" .
+	 "l" . $rule->{range_lo} . "\n" .
+	 "h" . $rule->{range_hi} . "\n";
+
+    $count++;
+
   }
 
+  # Output this
 
-  open (DAT, ">tmp/scores.data");
-  print DAT "N$size\n", "M$mutable\n", # informational only
-   $output;
+  open (DAT, ">$opt_o/scores.data");
+  print DAT "N$count\n", "M$mutable\n"; # informational
+  print DAT $output;
   close DAT;
 
-  open (OUT, ">tmp/scores.h");
-  print OUT "
+  open (OUT, ">$opt_o/scores.h");
+  print OUT <<EOF;
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
-
-int num_scores = $size;
+ 
+int num_scores = $count;
 int num_mutable = $mutable;
-unsigned char is_mutable[$size];
-double range_lo[$size];
-double range_hi[$size];
-double bestscores[$size];
-char *score_names[$size];
-double tmp_scores[$size][2];
+unsigned char is_mutable[$count];
+double range_lo[$count];
+double range_hi[$count];
+double bestscores[$count];
+char *score_names[$count];
+double tmp_scores[$count][2];
 unsigned char ny_hit[$mutable];
 unsigned char yn_hit[$mutable];
-
+ 
 double lookup[$mutable];
-
+ 
 /* readscores() is defined in tests.h */
+EOF
 
-";
   close OUT;
 
-  writetests_c($max_hits_per_msg); # make sure $rule_to_index is around
 }
 
+
 sub writetests_c {
-  my $max_hits_per_msg = $_[0];
 
-  my(%uniq_files) = ();
-  my(%count_keys) = ();
-  my(%file_key) = ();
+  my $max_hits_per_msg = 0;
+  my @goodtests;
+  my %uniq_logs;
+  my $uniq_key;
 
-  my $file;
+  my $i = 0;
 
-  for ($file = 0; $file < $num_tests; $file++)
-  {
-    my $uniq_key = $is_spam{$file} . " ";
+  # This will "compress" the logs so that one log entry can have a
+  # "count" of n indicating it reprents n similar messages
 
-    my(@good_tests) =
-     grep {length($_) && (! $ignored_rule{$_}) &&
-	    (defined($rule_to_index{$_}))} (@{ $tests_hit{$file} });
+  foreach my $log (@$logs) {
 
-    @good_tests = sort {$a <=> $b} (map {$rule_to_index{$_}} (@good_tests));
+    (@goodtests) = grep {exists($_->{index})} (@{$log->{tests_hit}});
+    @goodtests = sort {$a <=> $b} map {$_->{index}} @goodtests;
 
-    $uniq_key .= join(" ",@good_tests);
+    if($max_hits_per_msg < scalar(@goodtests)) {
+      $max_hits_per_msg = scalar(@goodtests);
+    }
 
-    if (exists($count_keys{$uniq_key})) {
-      $count_keys{$uniq_key}++;
+    $uniq_key = $log->{isspam} ? "s" : "";
+    $uniq_key .= join(" ", @goodtests);
+
+
+    # The %count_keys hash's entries will be the log info for each unique log
+    # $log->{count} is increased to indicate similar logs
+
+    if (exists($uniq_logs{$uniq_key})) {
+      $uniq_logs{$uniq_key}->{count}++;
     } else {
-      $count_keys{$uniq_key} = 1;
-      $file_key{$file} = $uniq_key;
-      $uniq_files{$file} = scalar(keys(%count_keys)) - 1;
+      $uniq_logs{$uniq_key} = $log;
+      $uniq_logs{$uniq_key}->{count} = 1;
     }
+
   }
 
-  my $num_nondup = scalar(keys(%uniq_files));
+  my $num_nondup = scalar(keys %uniq_logs);
 
-  open (TOP, ">tmp/tests.h");
-  print TOP "
+  open TOP, ">$opt_o/tests.h";
+  print TOP <<EOF;
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
@@ -274,7 +260,7 @@
 int num_tests = $num_tests;
 int num_nondup = $num_nondup;
 int num_spam = $num_spam;
-int num_nonspam = $num_nonspam;
+int num_nonspam = $num_ham;
 int max_hits_per_msg = $max_hits_per_msg;
 unsigned char num_tests_hit[$num_nondup];
 unsigned char is_spam[$num_nondup];
@@ -282,196 +268,77 @@
 double scores[$num_nondup];
 double tmp_total[$num_nondup];
 int tests_count[$num_nondup];
+EOF
 
-";
-  $_ = join ('', <DATA>);
-  print TOP $_;
+
+  print TOP join('', <DATA>);
   close TOP;
 
-  open (DAT, ">tmp/tests.data");
 
-  foreach $file (sort {$a <=> $b} (keys %uniq_files)) {
-    print DAT ".".$uniq_files{$file}."\n";
+  open (DAT, ">$opt_o/tests.data");
 
-    my $out = '';
-    $out .= "s".$is_spam{$file}."\n";
+  my $out;
+  my $base_score;
+  my $num_tests_hit;
 
-    my $base_score = 0;
-    my $num_tests_hit = 0;
-    foreach my $test (@{$tests_hit{$file}}) {
-      if ($test eq '') { next; }
+  $i = 0;
+  foreach my $log (values %uniq_logs) {
+    $out = '';
+    $base_score = $num_tests_hit = 0;
 
-      if ($ignored_rule{$test}) {
-        warn "ignored rule $test got a hit in $file!\n";
-        next;
+    print DAT "." . $i . "\n";
+
+    $out .= "s" . ( ($log->{isspam})? 1 : 0 ) . "\n";
+
+    foreach my $test (@{$log->{tests_hit}}) {
+      if (!$test->{score}) {
+	# Don't really know why this happens, but the old logs-to-c
+	#did it too
+
+	warn "ignored rule " . $test->{name} . " got a hit!";
+	next;
       }
 
-      if (!defined $rule_to_index{$test}) {
-	warn "test with no C index: $test\n";
+      if (!$test->{range_lo} && !$test->{range_hi}) {
+	# We ignored this rule
 	next;
       }
 
-      if ($mutable_tests{$test}) {
-      $num_tests_hit++;
-      $out .= "t".$rule_to_index{$test}."\n";
-
-      if ($num_tests_hit >= $max_hits_per_msg) {
-	die "Need to increase \$max_hits_per_msg";
+      # debugging...
+      if (!defined $test->{index}) {
+	warn "test with no index";
       }
-      } else {
-	$base_score += $scores{$test};
-      }
-    }
 
-    $out .= "b" . $base_score . "\n"; # score to add in for non-mutable tests
-    $out .= "c" . $count_keys{$file_key{$file}} . "\n";
+      if ($test->{ismutable}) {
+	$num_tests_hit++;
+	$out .= "t".$test->{index}."\n";
 
-    print DAT "n".$num_tests_hit."\n".$out;
-  }
-  close DAT;
-}
+	if ($num_tests_hit >= $max_hits_per_msg) {
+	  die "\$max_hits_per_msg not big enough!";
+	}
 
-sub read_ranges {
-  if (!-f 'tmp/ranges.data') {
-    system ("make tmp/ranges.data");
-  }
+      }
+      else {
+	$base_score += $test->{score};
+      }
 
-  # read ranges, and mutableness, from ranges.data.
-  open (IN, "<tmp/ranges.data")
-  	or die "need to run score-ranges-from-freqs first!";
-
-  my $count = 0;
-  while (<IN>) {
-    /^(\S+) (\S+) (\d+) (\S+)$/ or next;
-    my $t = $4;
-    $range_lo{$t} = $1+0;
-    $range_hi{$t} = $2+0;
-    my $mut = $3+0;
-
-    if ($allrules{$t}->{issubrule}) {
-      $ignored_rule{$t} = 1;
-      $mutable_tests{$t} = 0;
-      next;
     }
-    if (($range_lo{$t} == $range_hi{$t}) && (! $range_lo{$t})) {
-      #warn "ignored rule: score and range == 0: $t\n";
-      $ignored_rule{$t} = 1;
-      $mutable_tests{$t} = 0;
-      next;
-    }
 
-    $ignored_rule{$t} = 0;
-    $index_to_rule[$count] = $t;
-    $count++;
+    $out .= "b" . $base_score . "\n"; # score to add for non-mutable tests
+    $out .= "c" . $log->{count} . "\n"; # number of identical logs
 
-    if (!$mut) {
-      $mutable_tests{$t} = 0;
-    } elsif ($range_lo{$t} == $range_hi{$t}) {
-      $mutable_tests{$t} = 0;
-    } elsif ($allrules{$t}->{tflags} =~ m/\buserconf\b/i) {
-      $mutable_tests{$t} = 0;
-    } else {
-      $mutable_tests{$t} = 1;
-    }
-    unless ($mutable_tests{$t} || $scores{$t}) {
-      $ignored_rule{$t} = 1;
-    }
-  }
-  close IN;
+    print DAT "n" . $num_tests_hit . "\n" . $out;
 
-  # catch up on the ones missed; seems to be userconf or 0-hitters mostly.
-  foreach my $t (sort keys %allrules) {
-    next if (exists($range_lo{$t}));
-    if ($allrules{$t}->{issubrule}) {
-      $ignored_rule{$t} = 1;
-      $mutable_tests{$t} = 0;
-      next;
-    }
-    $ignored_rule{$t} = 0;
-    unless (exists($mutable_tests{$t}) &&
-	    ($allrules{$t}->{tflags} !~ m/\buserconf\b/i)) {
-      $mutable_tests{$t} = 0;
-    }
-    unless ($mutable_tests{$t} || $scores{$t}) {
-      $ignored_rule{$t} = 1;
-    }
-    $index_to_rule[$count] = $t;
-    $count++;
+    $i++;
   }
-  foreach my $t (keys %range_lo) {
-    next if ($ignored_rule{$t});
-    if ($mutable_tests{$t}) {
-      if (($scores{$t} == 1) && ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
-	$scores{$t} = -1;
-      } elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/) &&
-	       ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
-	$scores{$t} = -0.01;
-      }
-      if ($scores{$t} >= $range_hi{$t}) {
-	$scores{$t} = $range_hi{$t} - 0.001;
-      } elsif ($scores{$t} <= $range_lo{$t}) {
-	$scores{$t} = $range_lo{$t} + 0.001;
-      }
-    } else {
-      if ($allrules{$t}->{tflags} =~ m/\buserconf\b/i) {
-	next;
-      } elsif ($range_lo{$t} == $range_hi{$t}) {
-	$scores{$t} = $range_lo{$t};
-	next;
-      }
-      if (($scores{$t} == 1) && ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
-	$scores{$t} = -1;
-      } elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/) &&
-	       ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
-	$scores{$t} = -0.01;
-      }
-      if ($scores{$t} > $range_hi{$t}) {
-	$scores{$t} = $range_hi{$t};
-      } elsif ($scores{$t} < $range_lo{$t}) {
-	$scores{$t} = $range_lo{$t};
-      }
-    }
-  }
-}
 
-sub evaluate {
-   my $fprate = ($ga_ny / $num_tests) * 100.0;
-   my $fnrate = ($ga_yn / $num_tests) * 100.0;
+  close DAT;
 
-   printf ("\n# SUMMARY for threshold %3.1f:\n", $threshold);
-   printf "# Correctly non-spam: %6d  %4.2f%%  (%4.2f%% of non-spam corpus)\n", $ga_nn,
-       ($ga_nn /  $num_tests) * 100.0, ($ga_nn /  $num_nonspam) * 100.0;
-   printf "# Correctly spam:     %6d  %4.2f%%  (%4.2f%% of spam corpus)\n" , $ga_yy,
-       ($ga_yy /  $num_tests) * 100.0, ($ga_yy /  $num_spam) * 100.0;
-   printf "# False positives:    %6d  %4.2f%%  (%4.2f%% of nonspam, %6.0f weighted)\n", $ga_ny,
-       $fprate, ($ga_ny /  $num_nonspam) * 100.0, $nyscore*$nybias;
-   printf "# False negatives:    %6d  %4.2f%%  (%4.2f%% of spam, %6.0f weighted)\n", $ga_yn,
-       $fnrate, ($ga_yn /  $num_spam) * 100.0, $ynscore;
 
-  # convert to the TCR metrics used in the published lit
-  my $nspamspam = $ga_yy;
-  my $nspamlegit = $ga_yn;
-  my $nlegitspam = $ga_ny;
-  my $nlegitlegit = $ga_yn;
-  my $nlegit = $num_nonspam;
-  my $nspam = $num_spam;
-
-  my $werr = ($lambda * $nlegitspam + $nspamlegit)
-                  / ($lambda * $nlegit + $nspam);
-
-  my $werr_base = $nspam
-                  / ($lambda * $nlegit + $nspam);
-
-  $werr ||= 0.000001;     # avoid / by 0
-  my $tcr = $werr_base / $werr;
-
-  my $sr = ($nspamspam / $nspam) * 100.0;
-  my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;
-  printf "# TCR: %3.6f  SpamRecall: %3.3f%%  SpamPrec: %3.3f%%  FP: %3.2f%%  FN: %3.2f%%\n", $tcr, $sr, $sp, $fprate, $fnrate;
 }
 
-__DATA__
 
+__DATA__
 void loadtests (void) {
   FILE *fin = fopen ("tmp/tests.data", "r");
   char buf[256];
@@ -557,4 +424,3 @@
 
   printf ("Read scores for %d tests.\n", num_scores);
 }
-
Index: score-ranges-from-freqs
===================================================================
--- score-ranges-from-freqs	(revision 10703)
+++ score-ranges-from-freqs	(working copy)
@@ -1,251 +0,0 @@
-#!/usr/bin/perl -w
-#
-# <@LICENSE>
-# Copyright 2004 Apache Software Foundation
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# </@LICENSE>
-
-# (rough) graphic demo of this algorithm:
-# 0.0  = -limit [......] 0 ........ limit
-# 0.25 = -limit ..[..... 0 .]...... limit
-# 0.5  = -limit ....[... 0 ...].... limit
-# 0.75 = -limit ......[. 0 .....].. limit
-# 1.0  = -limit ........ 0 [......] limit
-my $sliding_window_limits = 4.8; # limits = [-$range, +$range]
-my $sliding_window_size =   5.5; # scores have this range within limits
-
-# 0.0  = -limit [......] 0 ........ limit
-# 0.25 = -limit ....[... 0 ]....... limit
-# 0.5  = -limit ......[. 0 .]...... limit (note: tighter)
-# 0.75 = -limit .......[ 0 ...].... limit
-# 1.0  = -limit ........ 0 [......] limit
-my $shrinking_window_lower_base =   0.00; 
-my $shrinking_window_lower_range =  1.00; # *ratio, added to above
-my $shrinking_window_size_base =    1.00;
-my $shrinking_window_size_range =   1.00; # *ratio, added to above
-
-my $use_sliding_window = 0;
-
-my $argcffile = shift @ARGV;
-my $scoreset = shift @ARGV;
-$scoreset = 0 if ( !defined $scoreset );
-
-if (defined ($argcffile) && $argcffile eq '-test') {
-  # use this to debug the ranking -> score-range mapping:
-  for $rat (0.0, 0.25, 0.5, 0.75, 1.0) {
-    my ($lo, $hi); if ($use_sliding_window) {
-      ($lo, $hi) = sliding_window_ratio_to_range($rat);
-    } else {
-      ($lo, $hi) = shrinking_window_ratio_to_range($rat);
-    }
-    warn "test: $rat => [ $lo $hi ]\n";
-  } exit;
-}
-
-my %freq_spam = ();
-my %freq_nonspam = ();
-
-my $num_spam;
-my $num_nonspam;
-my $num_total;
-
-my %mutable_tests = ();
-my %ranking = ();
-my %soratio = ();
-my %is_nice = ();
-
-if (!defined $argcffile) { $argcffile = "../rules"; }
-system ("./parse-rules-for-masses -d \"$argcffile\" -s $scoreset") and die;
-if (-e "tmp/rules.pl") {
-  # Note, the spaces need to stay in front of the require to work around a RPM 4.1 problem
-  require "./tmp/rules.pl";
-}
-else {
-  die "parse-rules-for-masses had no error but no tmp/rules.pl!?!";
-}
-
-while (<>) {
-  /^\s*([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+\S+\s+(.+)\s*$/ or next;
-
-  my $overall = $1+0;
-  my $spam = $2+0;
-  my $nonspam = $3+0;
-  my $soratio = $4+0;
-  my $ranking = $5+0;
-  my $test = $6;
-
-  if ($test eq '(all messages)') {
-    $num_spam = $spam;
-    $num_nonspam = $nonspam;
-    $num_total = $spam+$nonspam;
-    next;
-  }
-  next if ($test eq '(all messages as %)');
-
-  if (!defined ($rules{$test})) {
-    warn "rule $test no longer exists; ignoring\n";
-    next;
-  }
-
-  $freq{$test} = $overall;
-  $freq_spam{$test} = $spam;
-  $freq_nonspam{$test} = $nonspam;
-
-  my $tflags = $rules{$test}->{tflags}; $tflags ||= '';
-  if ($tflags =~ /\buserconf\b/ ||
-      ( ($scoreset % 2) == 0 && $tflags =~ /\bnet\b/ )) {
-    $mutable_tests{$test} = 0;
-  } else {
-    $mutable_tests{$test} = 1;
-  }
-  if ($tflags =~ m/\bnice\b/i) {
-    $is_nice{$test} = 1;
-  } else {
-    $is_nice{$test} = 0;
-  }
-
-  if ($overall < 0.01) {        # less than 0.01% of messages were hit
-    $mutable_tests{$test} = 0;
-    $soratio{$test} = 0.5;
-    $ranking{$test} = 0.0;
-    $rules{$test}->{score} = 0; # tvd - disable these rules automagically
-
-  } else {
-    $soratio{$test} = $soratio;
-    $ranking{$test} = $ranking;
-  }
-}
-
-if ( ! mkdir "tmp", 0755 ) {
-  warn "Couldn't create tmp directory!: $!\n";
-}
-
-open (OUT, ">tmp/ranges.data");
-foreach my $test (sort { $ranking{$b} <=> $ranking{$a} } keys %freq) {
-  if (!defined ($rules{$test})) {
-    warn "no rule $test";
-    print OUT ("0 0 0 $test\n");
-    next;
-  }
-
-  my $overall = $freq{$test};
-  my $spam = $freq_spam{$test};
-  my $nonspam = $freq_nonspam{$test};
-  my $soratio = $soratio{$test};
-  my $ranking = $ranking{$test};
-  my $mutable = $mutable_tests{$test};
-
-  if (!$mutable || $rules{$test}->{score} == 0) { # didn't look for score 0 - tvd
-    printf OUT ("%3.3f %3.3f 0 $test\n",
-                         $rules{$test}->{score},
-                         $rules{$test}->{score});
-    next;
-  }
-
-  # 0.0 = best nice, 1.0 = best nonnice
-  if ($is_nice{$test}) {
-    $ranking = .5 - ($ranking / 2);
-  } else {
-    $ranking = .5 + ($ranking / 2);
-  }
-
-  my ($lo, $hi);
-  if ($use_sliding_window) {
-    ($lo, $hi) = sliding_window_ratio_to_range($ranking);
-  } else {
-    ($lo, $hi) = shrinking_window_ratio_to_range($ranking);
-  }
-
-  # tvd
-  my $tflags = $rules{$test}->{tflags}; $tflags ||= '';
-  if ( $is_nice{$test} && ( $ranking < .5 ) ) { # proper nice rule
-    if ( $tflags =~ /\blearn\b/ ) { # learn rules should get a higher score # -5.4
-      $lo *=1.8;
-    }
-    elsif ($soratio <= 0.05 && $nonspam > 0.5) { # let good rules be larger if they want to, -4.5
-      $lo *= 1.5;
-    }
-
-    $hi =	($soratio == 0) ? $lo :
-    		($soratio <= 0.005 ) ? $lo/1.1 :
-    		($soratio <= 0.010 && $nonspam > 0.2) ? $lo/2.0 :
-		($soratio <= 0.025 && $nonspam > 1.5) ? $lo/10.0 :
-		0;
-
-    if ( $soratio >= 0.35 ) { # auto-disable bad rules
-      ($lo,$hi) = (0,0);
-    }
-  }
-  elsif ( !$is_nice{$test} && ( $ranking >= .5 ) ) { # proper spam rule
-    if ( $tflags =~ /\blearn\b/ ) { # learn rules should get a higher score
-      $hi *=1.8;
-    }
-    elsif ( $soratio >= 0.99 && $spam > 1.0 ) {
-      $hi *= 1.5; # let good rules be larger if they want to
-    }
-
-    $lo =	($soratio == 1) ? $hi:
-    		($soratio >= 0.995 ) ? $hi/4.0 :
-    		($soratio >= 0.990 && $spam > 1.0) ? $hi/8.0 :
-		($soratio >= 0.900 && $spam > 10.0) ? $hi/24.0 :
-		0;
-
-    if ( $soratio <= 0.65 ) { # auto-disable bad rules
-      ($lo,$hi) = (0,0);
-    }
-  }
-  else { # rule that has bad nice setting
-    ($lo,$hi) = (0,0);
-  }
-  $mutable = 0 if ( $hi == $lo );
-
-  printf OUT ("%3.1f %3.1f $mutable $test\n", $lo, $hi);
-}
-close OUT;
-exit;
-
-sub sliding_window_ratio_to_range {
-  my $ratio = shift;
-  my $lo = -$sliding_window_limits + ($sliding_window_size * $ratio);
-  my $hi = +$sliding_window_limits - ($sliding_window_size * (1-$ratio));
-  if ($lo > $hi) { # ???
-    ($lo,$hi) = ($hi,$lo);
-  }
-  ($lo, $hi);
-}
-
-sub shrinking_window_ratio_to_range {
-  my $ratio = shift;
-  my $is_nice = 0;
-  my $adjusted = ($ratio -.5) * 2;      # adj [0,1] to [-1,1]
-  if ($adjusted < 0) { $is_nice = 1; $adjusted = -$adjusted; }
-
-#$adjusted /= 1.5 if ( $ratio < 0.95 && $ratio > 0.15 ); # tvd
-
-  my $lower = $shrinking_window_lower_base 
-                        + ($shrinking_window_lower_range * $adjusted);
-  my $range = $shrinking_window_size_base 
-                        + ($shrinking_window_size_range * $adjusted);
-  my $lo = $lower;
-  my $hi = $lower + $range;
-  if ($is_nice) {
-    my $tmp = $hi; $hi = -$lo; $lo = -$tmp;
-  }
-  if ($lo > $hi) { # ???
-    ($lo,$hi) = ($hi,$lo);
-  }
-
-  ($lo, $hi);
-}
-
Index: find-extremes
===================================================================
--- find-extremes	(revision 10703)
+++ find-extremes	(working copy)
@@ -17,38 +17,144 @@
 # limitations under the License.
 # </@LICENSE>
 
-use Getopt::Std;
-getopts("l:L:h");
 
+use FindBin;
+use lib "$FindBin::Bin/../lib";
+use Mail::SpamAssassin::Masses;
+use Getopt::Long qw(:config bundling auto_help);
+use Pod::Usage;
+use strict;
+use warnings;
+
 use vars qw {
-  $opt_h $opt_l $opt_L
+$opt_c $opt_s $opt_l $opt_L $opt_inclang
 };
 
-sub usage {
-  die "find-extremes [-l LC] [-L LC] [spam log] [nonspam log]
+GetOptions("c|cffile=s@" => \$opt_c,
+           "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
+           "l|logfile=s" => \$opt_l,
+           "L|language=s" => \$opt_L,
+           "include-language=s" => \$opt_inclang);
 
-    -l LC  also print language specific rules for lang code LC (or 'all')
-    -L LC  only print language specific rules for lang code LC (or 'all')
 
-    options -l and -L are mutually exclusive.
 
-    if either the spam or and nonspam logs are unspecified, the defaults
-    are \"spam.log\" and \"nonspam.log\" in the cwd.
+my $lower = 1;
+#$threshold = 5;
+my $higher = 9;
+my $min_expected = 2; # Should not be set to more than 5 or less than 2
 
-";
+
+=head1 NAME
+ 
+find-extremes - Determine which rules are most likely to cause false positives/negatives.
+ 
+=head1 SYNOPSIS
+ 
+hit-frequencies [options]
+ 
+ Options:
+    -c,--cffile=path      Use path as the rules directory
+    -s,--scoreset=n       Use scoreset n
+    -l,--logfile=file     Read in file instead of masses.log
+    -L,--language=lc      Only print language specific tests for specified lang code (try 'all')
+    --include-language=lc Also print language specific tests for specified lang code (try 'all')
+ 
+=head1 DESCRIPTION
+
+B<hit-frequencies> will read the mass-check log F<masses.log> or the
+log given by the B<--logfile> option. By default, B<hit-frequencies>
+will assume the proper values for B<--cffile> based on the header of
+the masses.log. The output will include the following columns:
+
+=over 4
+
+=item RULE
+
+=item CHISQUARE
+
+=item RATIO_FALSEPOS
+
+=item OVER_FALSEPOS
+
+=item FREQ_OVER
+
+=back
+
+=head1 BUGS
+
+This script may or may not work as designed - it probably needs some
+tweaking, and I probably introduced a bug into it while re-writing for
+the new Masses stuff. 
+
+=head1 NOTES
+
+This script is poorly documented. Patches welcome.
+
+=cut
+
+
+$opt_s = 0 unless defined $opt_s;
+
+my $ok_lang = lc ( $opt_inclang || $opt_L || '');
+$ok_lang = '.' if ($ok_lang eq 'all');
+
+my $greprules = sub {
+  my ($name, $rule) = @_;
+
+  return 0 if (($opt_L && !$rule->{lang}) ||
+           ($rule->{lang} &&
+            (!$ok_lang || $rule->{lang} !~ /^$ok_lang/i))); # Wrong language
+
+  return 0 if ($rule->{tflags} =~ /\bnet\b/);
+
+  return 1;
+
+};
+
+$opt_l ||= "masses.log";
+
+if (!$opt_c || !scalar(@$opt_c)) {
+    # Try to read this in from the log, if possible
+    open (IN, $opt_l) or die "Can't open $opt_l: $!";
+    my $files = 0; # are we in the files section?
+    while(<IN>) {
+        if (!$files) {
+            if (/^\# SVN revision:/) {
+                $opt_c = [ "$FindBin::Bin/../rules" ];
+                last;
+            } elsif (/^\# Using configuration:$/) {
+                $files = 1;
+            }
+        } elsif (/^\#\s+(.*)\s*$/) {
+            push (@$opt_c, $1);
+        } else {
+            # All done!
+            last;
+        }
+    }
+
+    foreach my $file (@$opt_c) {
+        die "Can't read $file" unless -r $file;
+    }
 }
 
-usage() if($opt_h || ($opt_l && $opt_L));
+my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
+                                               scoreset => $opt_s,
+                                               greprules => $greprules,
+                                               logfile => $opt_l,
+                                               nologs => 1});
 
-$lower = 1;
-#$threshold = 5;
-$higher = 9;
-$min_expected = 2; # Should not be set to more than 5 or less than 2
+$masses->readrules();
+$masses->readlogs();
 
-my %freq_spam = ();	# how often non-nice found in spam
+my $rules = $masses->get_rules_hash();
+my $logs = $masses->get_logs();
+
+my $num_spam = $masses->get_num_spam();
+my $num_ham = $masses->get_num_ham();
+
 my %freq_over_higher_falsepos = (); # how often non-nice found in ones over
                                     # higher threshold that are false positives
-my %freq_nonspam = ();	# how often nice found in nonspam
 my %freq_under_lower_falseneg = (); # how often nice found in ones under
                                     # lower threshold that are false negatives
 
@@ -59,43 +165,54 @@
 my %ratio_expected_falsepos = (); # ratio version of above
 my %ratio_expected_falseneg = (); # ditto
 
-my $num_spam = 0;
-my $num_nonspam = 0;
 my $num_over_higher_falsepos = 0;
 my $num_under_lower_falseneg = 0;
-my $ok_lang = '';
 
-readscores();
+my %chisquare = ( );
+my %prob = ( );
 
-$ok_lang = lc ($opt_l || $opt_L || '');
-if ($ok_lang eq 'all') { $ok_lang = '.'; }
 
-foreach my $key (keys %rules) {
+foreach my $key (keys %$rules) {
 
-  if ( ($opt_L && !$rules{$key}->{lang}) ||
-       ($rules{$key}->{lang} &&
-         (!$ok_lang || $rules{$key}->{lang} !~ /^$ok_lang/i)
-     ) ) {
-    delete $rules{$key} ; next;
-  }
-
-  if ($rules{$key}->{tflags} =~ m/net/) {
-    delete $rules{$key};
-    next;
-  }
-  if ($rules{$key}->{tflags} !~ m/userconf/) {
-    if ($rules{$key}->{tflags} =~ m/nice/) {
-      $freq_nonspam{$key} = 0;
+  if ($rules->{$key}->{tflags} !~ /\buserconf\b/) {
+    if ($rules->{$key}->{tflags} =~ m/nice/) {
       $freq_under_lower_falseneg{$key} = 0;
     } else {
-      $freq_spam{$key} = 0;
       $freq_over_higher_falsepos{$key} = 0;
     }
   }
+
 }
 
-readlogs();
+foreach my $log (@$logs) {
 
+  if($log->{isspam}) {
+    # Also need to count plus_hits
+    my $plus_hits = 0;
+    foreach my $test (@{$log->{tests_hit}}) {
+      $plus_hits += $test->{score} if ($test->{score} > 0);
+    }
+
+    if(($log->{score} <= $lower) && $plus_hits && $plus_hits >= $lower) {
+      $num_under_lower_falseneg++;
+      foreach my $test (@{$log->{tests_hit}}) {
+	$num_under_lower_falseneg++;
+	$freq_under_lower_falseneg{$test->{name}}++ if exists $freq_under_lower_falseneg{$test->{name}};
+      }
+    }
+  }
+  else {
+    if($log->{score} > $higher) {
+      $num_over_higher_falsepos++;
+      foreach my $test (@{$log->{tests_hit}}) {
+	$num_over_higher_falsepos++;
+	$freq_over_higher_falsepos{$test->{name}}++ if exists $freq_over_higher_falsepos{$test->{name}};
+      }
+    }
+  }
+
+}
+
 unless (($num_over_higher_falsepos >= $min_expected)
 	&& ($num_under_lower_falseneg >= $min_expected)) {
   die "Insufficient extremes in dataset (" . $num_over_higher_falsepos .
@@ -119,12 +236,13 @@
 }
 
 my $ratio_falsepos = $num_over_higher_falsepos/$num_spam;
-my $ratio_falseneg = $num_under_lower_falseneg/$num_nonspam;
+my $ratio_falseneg = $num_under_lower_falseneg/$num_ham;
 
 my $skipped_non_nice = 0;
 
-foreach $rule (keys %freq_spam) {
-  my $expected = $freq_spam{$rule}*$ratio_falsepos;
+# non-nice rules
+foreach my $rule (keys %freq_over_higher_falsepos) {
+  my $expected = $rules->{$rule}->{freq_spam}*$ratio_falsepos;
   if ($expected <= $min_expected) {
     $skipped_non_nice++;
     next;
@@ -136,7 +254,7 @@
    $freq_over_higher_falsepos{$rule}/$expected;
   ($chisquare{$rule},$prob{$rule}) =
    chisquare($num_spam,$num_over_higher_falsepos,
-	     $freq_spam{$rule},$freq_over_higher_falsepos{$rule});
+	     $rules->{$rule}->{freq_spam},$freq_over_higher_falsepos{$rule});
   if ($freq_over_higher_falsepos{$rule} < $expected) {
     $chisquare{$rule} *= -1;
   }
@@ -146,8 +264,9 @@
 
 my $skipped_nice = 0;
 
-foreach $rule (keys %freq_nonspam) {
-  my $expected = $freq_nonspam{$rule}*$ratio_falseneg;
+# nice rules
+foreach my $rule (keys %freq_under_lower_falseneg) {
+  my $expected = $rules->{$rule}->{freq_ham}*$ratio_falseneg;
   if ($expected <= $min_expected) {
     $skipped_nice++;
     next;
@@ -158,8 +277,8 @@
   $ratio_expected_falseneg{$rule} =
    $freq_under_lower_falseneg{$rule}/$expected;
   ($chisquare{$rule},$prob{$rule}) =
-   chisquare($num_nonspam,$num_under_lower_falseneg,
-	     $freq_nonspam{$rule},$freq_under_lower_falseneg{$rule});
+   chisquare($num_ham,$num_under_lower_falseneg,
+	     $rules->{$rule}->{freq_ham},$freq_under_lower_falseneg{$rule});
   if ($freq_under_lower_falseneg{$rule} < $expected) {
     $chisquare{$rule} *= -1;
   }
@@ -167,8 +286,12 @@
 
 warn "Skipped nice: $skipped_nice\n";
 
-@rules_falsepos = grep {$prob{$_} < .5} (keys %over_expected_falsepos);
+# The rest is copied verbatim from before - its complicated and not
+# commented and should work unchanged except for the freq_spam and
+# freq_ham stuff and fixing some use strict stuff
 
+my @rules_falsepos = grep {$prob{$_} < .5} (keys %over_expected_falsepos);
+
 if (scalar(@rules_falsepos)) {
   print "RULE\t\tCHISQUARE\tRATIO_FALSEPOS\tOVER_FALSEPOS\tFREQ_OVER ($num_over_higher_falsepos)\n";
   my(@rules_falsepos_bad) =
@@ -183,7 +306,7 @@
 	   $over_expected_falsepos{$a}) ||
 	    ($freq_over_higher_falsepos{$b} <=>
 	     $freq_over_higher_falsepos{$a})} (@rules_falsepos_bad);
-    foreach $rule (@rules_falsepos_bad) {
+    foreach my $rule (@rules_falsepos_bad) {
       print $rule . "\t" . $prob{$rule} . "\t" .
        $ratio_expected_falsepos{$rule} . "\t" .
 	$over_expected_falsepos{$rule} . "\t" .
@@ -199,9 +322,9 @@
        ($chisquare{$a} <=> $chisquare{$b}) ||
 	($ratio_expected_falsepos{$a} <=>
 	 $ratio_expected_falsepos{$b}) ||
-	  ($freq_spam{$b} <=>
-	   $freq_spam{$a})} (@rules_falsepos_good);
-    foreach $rule (@rules_falsepos_good) {
+	  ($rules->{$b}->{freq_spam} <=>
+	   $rules->{$a}->{freq_spam})} (@rules_falsepos_good);
+    foreach my $rule (@rules_falsepos_good) {
       print $rule . "\t" . $prob{$rule} . "\t" .
        $ratio_expected_falsepos{$rule} . "\t" .
 	$over_expected_falsepos{$rule} . "\t" .
@@ -212,7 +335,7 @@
   warn "No over-falsepos to print\n";
 }
 
-@rules_falseneg = grep {$prob{$_} < .5} (keys %over_expected_falseneg);
+my @rules_falseneg = grep {$prob{$_} < .5} (keys %over_expected_falseneg);
 
 if (scalar(@rules_falseneg)) {
   print "RULE\t\tCHISQUARE\tRATIO_FALSENEG\tOVER_FALSENEG\tFREQ_UNDER ($num_under_lower_falseneg)\n";
@@ -228,7 +351,7 @@
 	   $over_expected_falseneg{$a}) ||
 	    ($freq_under_lower_falseneg{$b} <=>
 	     $freq_under_lower_falseneg{$a})} (@rules_falseneg_bad);
-    foreach $rule (@rules_falseneg_bad) {
+    foreach my $rule (@rules_falseneg_bad) {
       print $rule . "\t" . $prob{$rule} . "\t" .
        $ratio_expected_falseneg{$rule} . "\t" .
 	$over_expected_falseneg{$rule} . "\t" .
@@ -244,9 +367,9 @@
        ($chisquare{$a} <=> $chisquare{$b}) ||
 	($ratio_expected_falseneg{$a} <=>
 	 $ratio_expected_falseneg{$b}) ||
-	  ($freq_spam{$b} <=>
-	   $freq_spam{$a})} (@rules_falseneg_good);
-    foreach $rule (@rules_falseneg_good) {
+	  ($rules->{$b}->{freq_ham} <=>
+	   $rules->{$a}->{freq_ham})} (@rules_falseneg_good);
+    foreach my $rule (@rules_falseneg_good) {
       print $rule . "\t" . $prob{$rule} . "\t" .
        $ratio_expected_falseneg{$rule} . "\t" .
 	$over_expected_falseneg{$rule} . "\t" .
@@ -258,97 +381,3 @@
 }
 
 exit;
-
-sub readlogs {
-  my $spam = $ARGV[0] || "spam.log";
-  my $nonspam = $ARGV[1] || (-f "good.log" ? "good.log" : "nonspam.log");
-
-
-  (open(NONSPAM,$nonspam)) ||
-   (die "Couldn't open file '$nonspam': $!; stopped");
-
-  while (defined($line = <NONSPAM>)) {
-    if ($line =~ m/^\s*\#/) {
-      next;
-    } elsif ($line =~ m/^.\s+-?\d+\s+\S+\s*(\S*)/) {
-      my $tests = $1;
-      my $hits = 0;
-      my(@tests) = ();
-      foreach $test (grep {length($_)} (split(/,+/,$tests))) {
-	if (exists($rules{$test})) {
-	  push @tests, $test;
-	  $hits += $rules{$test}->{score};
-	}
-      }
-      
-      if (scalar(@tests)) {
-	$num_nonspam++;
-	foreach $test (grep {exists($freq_nonspam{$_})} (@tests)) {
-	  $freq_nonspam{$test}++;
-	}
-	if ($hits >= $higher) {
-	  $num_over_higher_falsepos++;
-	  foreach $test (grep
-			 {exists($freq_over_higher_falsepos{$_})} (@tests)) {
-	    $freq_over_higher_falsepos{$test}++;
-	  }
-	}
-      }
-    } elsif ($line =~ m/\S/) {
-      chomp($line);
-      warn "Can't interpret line '$line'; skipping";
-    }
-  }
-
-  close(NONSPAM);
-
-  (open(SPAM,$spam)) || (die "Couldn't open file '$spam': $!; stopped");
-
-  while (defined($line = <SPAM>)) {
-    if ($line =~ m/^\s*\#/) {
-      next;
-    } elsif ($line =~ m/^.\s+-?\d+\s+\S+\s*(\S*)/) {
-      my $tests = $1;
-      my $hits = 0;
-      my $plus_hits = 0;
-      my(@tests) = ();
-      foreach $test (grep {length($_)} (split(/,+/,$tests))) {
-	if (exists($rules{$test})) {
-	  push @tests, $test;
-	  $hits += $rules{$test}->{score};
-	  if ($rules{$test}->{score} > 0) {
-	    $plus_hits += $rules{$test}->{score};
-	  }
-	}
-      }
-      
-      if (scalar(@tests)) {
-	$num_spam++;
-	foreach $test (grep {exists($freq_spam{$_})} (@tests)) {
-	  $freq_spam{$test}++;
-	}
-	if (($hits <= $lower) && $plus_hits &&
-	    ($plus_hits >= $lower)) {
-	  $num_under_lower_falseneg++;
-	  foreach $test (grep
-			 {exists($freq_under_lower_falseneg{$_})} (@tests)) {
-	    $freq_under_lower_falseneg{$test}++;
-	  }
-	}
-      }
-    } elsif ($line =~ m/\S/) {
-      chomp($line);
-      warn "Can't interpret line '$line'; skipping";
-    }
-  }
-
-  close(SPAM);
-}
-
-
-sub readscores {
-  system ("./parse-rules-for-masses") and
-   die "Couldn't do parse-rules-for-masses: $?; stopped";
-  require "./tmp/rules.pl";
-}
-