Attachment #1967 for bug #2853

View | Details | Raw Unified | Return to bug 2853
Collapse All | Expand All





    # read a file called "init.pre" in site rules dir *before* all others;
    # even the system config.

    # Save this in $self so that it can be accessed externally (for logging, etc.)
    $self->{site_rules_filename} ||= $self->first_existing_path (@site_rules_path);
    my $siterules = $self->{site_rules_filename};
    $siterules ||= $self->first_existing_path (@site_rules_path);

    $self->{rules_filename} ||= $self->first_existing_path (@default_rules_path);
    my $sysrules = $self->{rules_filename};
    $sysrules ||= $self->first_existing_path (@default_rules_path);

    if ($siterules) {
      $fname = File::Spec->catfile ($siterules, "init.pre");

      $self->get_and_create_userstate_dir();

      # user prefs file
      $self->{userprefs_filename} ||= $self->first_existing_path (@default_userprefs_path);
      $fname = $self->{userprefs_filename};
      $fname ||= $self->first_existing_path (@default_userprefs_path);

      if (defined $fname) {
        if (!-f $fname && !$self->{dont_copy_prefs} && !$self->create_default_prefs($fname)) {




# <@LICENSE>
# Copyright 2004 Apache Software Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

=head1 NAME

Mail::SpamAssassin::Masses - Interface for reading and parsing rules
and mass-check logs for SpamAssassin

=head1 SYNOPSIS

  my $parser = Mail::SpamAssassin::Masses->new();
  $parser->readrules();
  $parser->readlogs();

  my $rules = $parser->get_rules_hash();
  my $logs = $parser->get_logs();

  foreach my $test (keys %$rules) {
    if ($rules->{$test}->{score} > 1) {
      ...
    }
  }

=head1 DESCRIPTION

Mail::SpamAssassin::Masses is a module to simplify the many scripts
that used to make up the SpamAssassin re-scoring process. By
consolidating all the shared code in one module, the scripts can be
simplified and require fewer temporary files.

=head1 METHODS

=over 4

=cut

package Mail::SpamAssassin::Masses;

use strict;
use warnings;
use Carp;

=item $parser = Mail::SpamAssassin::Masses->new( [ { opt => val, ... } ] );

Construct a new Mail::SpamAssassin::Masses object. You may pass the
following attribute-value pairs to the constructor.

=over 4

=item rulesdir

The directory containing rules. If multiple directories are desired,
an anonymous array should be passed.

=item scoreset

Scoreset to deal with.

=item logfile

Filename of mass-check log.

=item falses

Also count frequencies for false positives and false negatives from
the logs.

=item falsesonly

Only count false positives and false negatives.

=item greprules

Coderef that is passed a rule name and a hash ref with the entries
containing info about the rule. If the sub returns false, the rule is skipped.

=item greplogs

Coderef that is passed a raw log entry. If it returns false, the entry
is skipped.

=item sliding_window

Use a sliding window for score ranges rather than a shrinking window.

=item nologs

Save memory by not saving the individual log results, just the
aggregate totals

=back

=cut

sub new {
  my ($proto, $opts) = @_;

  # Works both as a class method and as an instance method: when invoked
  # on an object, the new object takes that object's class.
  my $class = ref($proto) || $proto;

  # The option hash supplied by the caller (if any) is used directly as
  # the object; otherwise start from an empty hash.
  my $self = defined $opts ? $opts : { };

  # Defaults for options that are unset (or false).
  $self->{scoreset} ||= 0;
  $self->{rulesdir} ||= '';
  $self->{logfile}  ||= "masses.log";

  return bless($self, $class);
}

=item $parser->readrules()

Read and parse the rules from the directory specified as
C<rulesdir>. This loads the following keys and values into the hash
entry representing the rules (see below).

=over 4

=item name

Contains the rule's name.

=item score

Contains the rule's score.

=item type

Contains the rule's type (header, body, uri, etc.)

=item tflags

Contains the rules tflags (nice, autolearn, etc.) as specified in the config file.

=item lang

Set to the value of C<lang> for language-specific tests.

=item issubrule

Set to true if the rules is a sub-rule, (i.e. it starts with
__). Otherwise, undefined.

=item isnice

This key exists and is true if the rule is nice (i.e. with a score
that can be below zero).

=item describe

Set to the rule's description, in English, or in the rule's language.

=back

There may be more values once C<readlogs()> is run.

=cut


# Parse every rules file named by the "rulesdir" option and fill in
# $self->{rules}, a hash keyed by rule name.
#
# "rulesdir" may be a single path or an array ref of paths.  A path that is
# a directory contributes all of its *.cf files; any other path is read as a
# single rules file.  Croaks if a file cannot be opened.
#
# After parsing:
#   - entries that never received a rule definition (no "type") are dropped;
#   - rules without an explicit score get a default (1.0, or 0.01 for T_
#     rules; negated for "nice" rules);
#   - rules rejected by the optional "greprules" callback are dropped.
#
# Sets $self->{_readrules} and returns that (true) value.
sub readrules {

  my $self = shift;

  $self->{rules} ||= { };
  my $rules = $self->{rules}; # $rules is a reference to the anon hash

  my @dirs = ref($self->{rulesdir}) ? @{$self->{rulesdir}} : $self->{rulesdir};

  foreach my $indir (@dirs) {
    # no reason to only do numbered files
    my @files = (-d $indir) ? glob("$indir/*.cf") : ( $indir );

    foreach my $file (@files) {
      # Three-argument open on a lexical handle: the file name can never be
      # misread as an open mode, and the handle cannot clash with another
      # bareword IN used elsewhere.
      open (my $in, '<', $file) or croak("Can't open $file, $!");

      while (<$in>) {
        s/#.*$//g;
        s/^\s+//;
        s/\s+$//;
        next if /^$/;

        # An optional "lang xx" prefix restricts the rest of the line to
        # that language.
        my $lang = '';
        if (s/^lang\s+(\S+)\s+//) {
          $lang = lc $1;
        }

        if (/^(header|rawbody|body|full|uri|meta)\s+(\S+)\s+/) {
          my ($type, $name) = ($1, $2);

          $rules->{$name} ||= { };
          $rules->{$name}->{name} = $name;
          $rules->{$name}->{type} = $type;
          $rules->{$name}->{lang} = $lang if $lang;

          # Only default tflags when none have been seen yet; a tflags line
          # read before the definition must not be wiped (isnice would
          # otherwise disagree with tflags).
          $rules->{$name}->{tflags} = ''
            unless defined $rules->{$name}->{tflags};

          if ($name =~ /^__/) {
            $rules->{$name}->{issubrule} = '1';
          }

        } elsif (/^describe\s+(\S+)\s+(.+)$/) {
          my ($name, $text) = ($1, $2);

          # Let's get description in english, por favor -- unless the rule
          # isn't english
          next if ($lang && (!$rules->{$name}->{lang} || $rules->{$name}->{lang} ne $lang));

          $rules->{$name} ||= { };
          $rules->{$name}->{describe} = $text;

        } elsif (/^tflags\s+(\S+)\s+(.+)$/) {
          my ($name, $flags) = ($1, $2);

          $rules->{$name} ||= { };
          $rules->{$name}->{tflags} = $flags;
          if ($flags =~ /nice/) {
            $rules->{$name}->{isnice} = 1;
          }

        } elsif (/^score\s+(\S+)\s+(.+)$/) {
          my ($name, $score) = ($1, $2);

          $rules->{$name} ||= { };
          if ( $score =~ /\s/ ) { # there are multiple scores
            ($score) = (split(/\s+/,$score))[$self->{scoreset}];
          }
          $rules->{$name}->{score} = $score;
        }
      }
      close $in;
    }
  }

  foreach my $rule (keys %{$rules}) {
    if (!defined $rules->{$rule}->{type}) {
      delete $rules->{$rule};   # no rule definition -> no rule
      next;
    }

    if (!defined $rules->{$rule}->{score}) {
      my $def = 1.0;
      if ($rule =~ /^T_/) { $def = 0.01; }

      if ($rules->{$rule}->{isnice}) {
        $rules->{$rule}->{score} = -$def;
      } else {
        $rules->{$rule}->{score} = $def;
      }
    }

    # Let the caller filter rules; a false return drops the rule.
    if ($self->{greprules} && !$self->{greprules}->($rule, $rules->{$rule}))
    {
      delete $rules->{$rule};
      next;
    }

  }

  $self->{_readrules} = 1;
}

=item $parser->readlogs()

Read and parse the mass-check log named by C<logfile>. Unless C<nologs>
is set, this will create the anonymous array of hashes referred to by
C<< $parser->{logs} >>, with the following keys:

=over 4

=item isspam

True if the message is spam. False or undefined otherwise.

=item isfalse

True if the message was a false negative or positive.

=item tests_hit

Array reference containing references to the hash representing each
rule hit.

=item score

Score the message received (under current scores).

=back

In addition, this method adds the following keys to the rule
information in C<< $parser->{rules} >>.

=over 4

=item freq_spam

Frequency hit in spam.

=item freq_ham

Frequency hit in ham.

=item freq_fp

Frequency in false positives.

=item freq_fn

Frequency in false negatives.

=back

Also, sets C<< $parser->{num_spam} >> and C<< $parser->{num_ham} >> to the number of
spam logs read and the number of ham logs read, respectively.

=cut

# Read the mass-check log named by the "logfile" option, updating per-rule
# hit frequencies and (unless "nologs" is set) appending one entry per
# message to $self->{logs}.
#
# Options consulted: nologs, falses, falsesonly, greplogs, logfile.
# Reads the rules first if readrules() has not been run yet.
# Croaks if the log file cannot be opened.
#
# Sets num_spam / num_ham (and num_fn / num_fp when "falses" is set) on
# $self, then sets $self->{_readlogs} and returns that (true) value.
sub readlogs {

  my $self = shift;

  if (!$self->{_readrules}) {
    # need to read scores first!
    $self->readrules();
  }

  my $rules = $self->{rules}; # copy the ref, shorthand

  my $keep_logs = !$self->{nologs};
  my $logs;
  if ($keep_logs) {
    $self->{logs} ||= [ ];
    $logs = $self->{logs};
  }

  my ($num_spam, $num_ham, $num_fp, $num_fn) = (0, 0, 0, 0);

  # First, initialize stuff
  foreach my $rule (values %{$rules}) {
    $rule->{freq_spam} ||= 0;
    $rule->{freq_ham} ||= 0;

    if ($self->{falses}) {
      $rule->{freq_fp} ||= 0;
      $rule->{freq_fn} ||= 0;
    }
  }

  my $file = $self->{logfile};
  # An unreadable log used to be silently treated as an empty one.
  open (my $in, '<', $file) or croak("Can't open $file, $!");

  while (<$in>) {
    next if /^\#/;
    next if /^$/;
    if ($_ !~ /^(.)\s+(.)\s+-?[\d.]+\s+\S+(\s+\S+\s+)/) { warn "bad line: $_"; next; }

    # Save the captures before anything else gets a chance to run a regex.
    my ($manual, $result, $hits) = ($1, $2, $3);

    # The callback sees the raw, unmodified log line.
    if ($self->{greplogs} && !$self->{greplogs}->($_)) {
      next;
    }

    $hits =~ s/(?:bayes|time)=\S+//;
    $hits =~ s/,,+/,/g;
    $hits =~ s/^\s+//;
    $hits =~ s/\s+$//;

    # The two leading fields disagree -> false positive/negative.
    my $isfalse = ($manual ne $result);
    next if (!$isfalse && $self->{falsesonly});

    # Per-message record; only stored when logs are being kept.  The
    # isfalse flag belongs to the message, not to the parser object.
    my %entry;
    $entry{isfalse} = 1 if $isfalse;

    if ($manual eq "s") {
      $num_spam++;
      $entry{isspam} = 1;
      $num_fn++ if $result eq "h";
    } else {
      $num_ham++;
      $num_fp++ if $result eq "s";
    }

    my @tests = ();
    my $score = 0;
    foreach my $tst (split (/,/, $hits)) {
      next if ($tst eq '');

      # Don't count non-existant rules
      # (Could happen with greprules)
      next if ( !$rules->{$tst} || !$rules->{$tst}->{type} );

      if ($manual eq "s") {
        $rules->{$tst}->{freq_spam}++;
        $rules->{$tst}->{freq_fn}++ if ($self->{falses} && $result eq "h");
      }
      else {
        $rules->{$tst}->{freq_ham}++;
        $rules->{$tst}->{freq_fp}++ if ($self->{falses} && $result eq "s");
      }

      $score += $rules->{$tst}->{score};

      push (@tests, $rules->{$tst}) if $keep_logs;
    }

    if ($keep_logs) {
      $entry{tests_hit} = \@tests;
      $entry{score} = $score;
      push (@{$logs}, \%entry);
    }
  }
  close $in;

  $self->{num_spam} = $num_spam;
  $self->{num_ham} = $num_ham;
  if ($self->{falses}) {
    $self->{num_fn} = $num_fn;
    $self->{num_fp} = $num_fp;
  }

  $self->{_readlogs} = 1; # Done reading logs

}

=item $parser->do_statistics();

Calculate the hit percentages and the S/O ratio for each test.

This adds the following keys to the rules hashes.

=over 4

=item spam_percent

Percentage of spam messages hit.

=item ham_percent

Percentage of ham messages hit.

=item soratio

S/O ratio -- percentage of spam messages hit divided by total
percentage of messages hit.

=back

=cut

# Compute spam_percent, ham_percent and soratio for every rule from the
# frequencies gathered by readlogs() (which is run first if needed).
#
# A percentage is 0 when the rule never hit, or when the corresponding
# message total is 0 (avoids a division by zero).  soratio is 0.5 when both
# percentages are 0, i.e. there is no evidence either way.
#
# Sets $self->{_statistics} and returns that (true) value.
sub do_statistics {
  my $self = shift;

  if (! $self->{_readlogs} ) {
    $self->readlogs();
  }

  my $num_spam = $self->{num_spam};
  my $num_ham = $self->{num_ham};

  foreach my $rule (values %{$self->{rules}}) {

    $rule->{spam_percent} = ($rule->{freq_spam} && $num_spam)
      ? $rule->{freq_spam} / $num_spam * 100.0
      : 0;

    $rule->{ham_percent} = ($rule->{freq_ham} && $num_ham)
      ? $rule->{freq_ham} / $num_ham * 100.0
      : 0;

    my $total_percent = $rule->{spam_percent} + $rule->{ham_percent};

    $rule->{soratio} = $total_percent
      ? $rule->{spam_percent} / $total_percent
      : 0.5;

  }

  $self->{_statistics} = 1;

}

=item $parser->do_rank();

Calculates the ranking for each rule and stores this in the
appropriate key.

=over 4

=item rank

"Rank" of the rule. High numbers are good, low are bad.

=back

=cut

# Assign each rule a "rank" normalised to [0, 1] (0.001 for every rule when
# all raw ranks are equal).  The raw rank is the sum of two positions:
#   - the rule's position when sorted by ascending "wanted" hits, and
#   - its position when sorted by descending "unwanted" hits, scaled so
#     both halves span a comparable number of distinct positions.
# For nice rules "wanted" means ham hits; for all others, spam hits.
# Rules with equal hit counts share a position.
# Dies with "Error: no rules read" if there are no rules.
sub do_rank {

  my ($self) = @_;

  $self->do_statistics() unless $self->{_statistics};

  my $rules = $self->{rules};

  my (%wanted, %unwanted);   # rule name -> hit count
  my (%wranks, %uranks);     # distinct hit counts seen

  for my $rule (values %{$rules}) {
    my $name = $rule->{name};

    if ($rule->{isnice}) {
      $wanted{$name}   = $rule->{freq_ham};
      $unwanted{$name} = $rule->{freq_spam};
    }
    else {
      $wanted{$name}   = $rule->{freq_spam};
      $unwanted{$name} = $rule->{freq_ham};
    }

    $wranks{$wanted{$name}}   = 1;
    $uranks{$unwanted{$name}} = 1;
  }

  # first half of ranking is the wanted rank
  my ($step, $prev) = (0, undef);
  for my $name (sort { $wanted{$a} <=> $wanted{$b} } keys %wanted) {
    $step++ if defined $prev && $prev != $wanted{$name};
    $rules->{$name}->{rank} += $step;
    $prev = $wanted{$name};
  }

  # Avoid divide by 0 errors!
  my $n_uranks = scalar keys %uranks;
  die "Error: no rules read" if (!$n_uranks);

  my $normalize = (scalar keys %wranks) / $n_uranks;

  # second half is the unwanted rank; track the extremes of the combined
  # rank while we are at it
  my $rank_hi = 0;
  my $rank_lo = 9999999;
  ($step, $prev) = (0, undef);
  for my $name (sort { $unwanted{$b} <=> $unwanted{$a} } keys %unwanted) {
    $step++ if defined $prev && $prev != $unwanted{$name};

    my $entry = $rules->{$name};
    $entry->{rank} += ($step * $normalize);
    $prev = $unwanted{$name};

    $rank_hi = $entry->{rank} if ($entry->{rank} > $rank_hi);
    $rank_lo = $entry->{rank} if ($entry->{rank} < $rank_lo);
  }

  # rescale so the lowest rank maps to 0 and the highest to 1
  my $span = $rank_hi - $rank_lo;
  for my $rule (values %{$rules}) {
    if ($span == 0) {
      $rule->{rank} = 0.001;
    }
    else {
      $rule->{rank} = ($rule->{rank} - $rank_lo) / $span;
    }
  }

  $self->{_rank} = 1;
}

=item $parser->get_rules_array();

Returns a reference to an array of hash references. The values of
these hash have keys as listed above.

=cut

# Return a reference to a freshly built array holding every rule hash.
sub get_rules_array {
  my ($self) = @_;
  my @rule_list = values %{ $self->{rules} };
  return \@rule_list;
}

=item $parser->get_rules_hash();

Returns a reference to a hash with rule names as keys and hash
references as values. The values of these hash have keys as listed
above.

=cut

# Return the parser's own rules hash (rule name => rule hash); the
# reference is shared with the parser, not copied.
sub get_rules_hash {
  my ($self) = @_;
  return $self->{rules};
}

=item $parser->get_logs();

Returns a reference to the array containing log entries, in the form
of anonymous hashes with keys as described above.

=cut

# Return the array ref of per-message log entries stored on the parser
# (undefined if none is stored).
sub get_logs {
  my ($self) = @_;
  return $self->{logs};
}

=item $parser->get_num_ham();

Returns number of ham logs read.

=cut

# Return the stored count of ham log entries.
sub get_num_ham {
  my ($self) = @_;
  return $self->{num_ham};
}

=item $parser->get_num_spam();

Returns number of spam logs read.

=cut

# Return the stored count of spam log entries.
sub get_num_spam {
  my ($self) = @_;
  return $self->{num_spam};
}

=item $parser->do_score_ranges();

Figure out range in which score can be set based on the soratio, etc.

This is necessary so that the perceptron doesn't set silly
scores. (This may not be as much of a problem as it was with the old
GA.)

This adds the following keys to the rules hashes:

=over 4

=item ismutable

Determines whether the perceptron can select a score for this test.

=item range_lo

Determines the lowest score the perceptron can set.

=item range_hi

Determines the highest score the perceptron can set.

=back

=cut

# For every rule, work out the score range [range_lo, range_hi] and whether
# the score may be changed at all (ismutable).  Runs do_statistics() and
# do_rank() first if they have not been run.
#
#   - Rules that (almost) never hit, or whose score is 0, are pinned to 0.
#   - userconf rules, and net rules in an even-numbered scoreset, keep
#     their current score and are immutable.
#   - All other rules get a range derived from their rank, widened for
#     "learn" rules and very accurate rules, and collapsed to (0,0) for
#     rules whose S/O ratio is poor.
#
# Declared without a prototype: it is a method that receives $self, so the
# former empty prototype "()" was wrong (prototypes are ignored on method
# calls, but it made a plain function call fail to compile).
sub do_score_ranges {

  my $self = shift;

  if ( !$self->{_statistics} ) {
    $self->do_statistics();
  }
  if ( !$self->{_rank} ) {
    $self->do_rank();
  }

  foreach my $rule (values %{$self->{rules}}) {

    my ($lo, $hi);

    # Get rid of rules that don't hit -- and disable completely.
    if ($rule->{spam_percent} + $rule->{ham_percent} < 0.01 ||
        $rule->{score} == 0) {

      $rule->{ismutable} = 0;
      $rule->{range_lo} = $rule->{range_hi} = 0;
      next;

    }

    # next: get rid of tests that don't apply in this scoreset
    # or are userconf -- set ismutable to 0, but keep the score
    if ($rule->{tflags} =~ /\buserconf\b/ ||
        (($self->{scoreset} % 2) == 0 && $rule->{tflags} =~/\bnet\b/)) {

      $rule->{ismutable} = 0;
      $rule->{range_lo} = $rule->{range_hi} = $rule->{score};
      next;

    }


    # Normal rules:

    # This seems to convert from [-1,1] to [0,1] but we're already in
    # [0,1] space - Is this right?

    # The current way ranks are calculated, > 0.5 and < 0.5 have no
    # special meaning

#      # 0.0 = best nice, 1.0 = best nonnice
#      if ($rule->{isnice}) {
#        $rank = .5 - ($rank / 2);
#      } else {
#        $rank = .5 + ($rank / 2);
#      }

    # using this seems to work better

    # Starting range: nice rules may go down to -4.5 * rank, others up to
    # +4.5 * rank.
    if ($rule->{isnice}) {
      $hi = 0;
      $lo = $rule->{rank} * -4.5;
    } else {
      $hi = $rule->{rank} * 4.5;
      $lo = 0;
    }

    # Modify good rules to be lower
    if ($rule->{isnice}) {
      if ($rule->{tflags} =~ /\blearn\b/) { # learn rules should get
                                            # higher scores (-5.4)
        $lo *= 1.8;
      }
      elsif ( $rule->{soratio} <= 0.05 && $rule->{ham_percent} > 0.5) {
        $lo *= 1.5;
      }

      # The more reliably the rule hits only ham, the closer the upper
      # bound is pulled towards the lower one.
      # argh, ugly... but i'm copying it whole...
      $hi = ($rule->{soratio} == 0) ? $lo :
            ($rule->{soratio} <= 0.005 ) ? $lo/1.1 :
            ($rule->{soratio} <= 0.010 && $rule->{ham_percent} > 0.2) ? $lo/2.0 :
            ($rule->{soratio} <= 0.025 && $rule->{ham_percent} > 1.5) ? $lo/10.0 :
            0;

      if ($rule->{soratio} >= 0.35 ) {
        ($lo, $hi) = (0,0);
      }
    }
    else { # Make non-nice rules have higher scores if they're good
      if ($rule->{tflags} =~ /\blearn\b/ ) {
        $hi *= 1.8;
      }
      elsif ( $rule->{soratio} >= 0.99 && $rule->{spam_percent} > 1.0) {
        $hi *= 1.5;
      }

      # The more reliably the rule hits only spam, the closer the lower
      # bound is pulled towards the upper one.
      $lo = ($rule->{soratio} == 1) ? $hi:
            ($rule->{soratio} >= 0.995 ) ? $hi/4.0 :
            ($rule->{soratio} >= 0.990 && $rule->{spam_percent} > 1.0) ? $hi/8.0 :
            ($rule->{soratio} >= 0.900 && $rule->{spam_percent} > 10.0) ? $hi/24.0 :
            0;

      if ($rule->{soratio} <= 0.65 ) { # auto-disable bad rules
        ($lo, $hi) = (0,0);
      }
    }


    # Some sanity checking
    if ($hi < $lo) {
      ($lo, $hi) = ($hi, $lo);
    }


    # A degenerate range leaves nothing to choose.
    $rule->{ismutable} = ($lo == $hi) ? 0 : 1;
    $rule->{range_lo} = $lo;
    $rule->{range_hi} = $hi;

  }
}


# Pacify perl
1;
  - craig-evolve.scores
  + craig-evolve.scores




#!/usr/bin/perl
#
# <@LICENSE>
# Copyright 2004 Apache Software Foundation
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

# Print the command-line help text on STDERR and terminate the program.
# The text ends in a newline, so die() does not append a file/line suffix.
sub usage {
  my $help_text = <<'END_USAGE';

parse-rules-for-masses: parse the SpamAssassin rules files for mass-checks,
        evolving, and frequency analysis

usage: ./parse-rules-for-masses [-d rulesdir] [-o outputfile] [-s scoreset]

rulesdir defaults to ../rules
outputfile defaults to ./tmp/rules.pl
scoreset default to 0

END_USAGE

  die $help_text;
}

use Getopt::Long;
use Data::Dumper;

# Main program: parse the command line, read the rules, extract the scores
# and dump both structures to the output file.

# Package globals filled in by GetOptions below.
use vars qw(@rulesdirs $outputfile $scoreset);
GetOptions (
                "d=s" => \@rulesdirs,
                "o=s" => \$outputfile,
		"s=i" => \$scoreset,
                "help|h|?" => sub { usage(); } );

# No -d given: fall back to the default rules directory.
if ($#rulesdirs < 0) {
  @rulesdirs = ("../rules");
}

# No -o given: write to ./tmp/rules.pl, creating ./tmp if needed
# (the result of mkdir is not checked).
if (!defined $outputfile) {
  $outputfile = "./tmp/rules.pl";
  mkdir ("tmp", 0755);
}

$scoreset = 0 if ( !defined $scoreset );

# File-scoped lexical; readrules() and writerules() below use it directly
# rather than receiving it as an argument.
my $rules = { };
readrules(@rulesdirs);

# Rule name => score, taken from the rules just read.  Also used directly
# by writerules().
my $scores = { };
foreach my $key (keys %{$rules}) {
  $scores->{$key} = $rules->{$key}->{score};
}

writerules($outputfile);
exit;

# Read the numbered rules files ([0-9]*.cf) from each directory given as an
# argument, in sorted order per directory, and fill in the file-scoped
# $rules hash (rule name => { type, lang, issubrule, tflags, describe,
# score }).  The score is picked from a multi-score line using the
# file-scoped $scoreset.
#
# Afterwards, entries that never received a rule definition are removed and
# rules without a score get a default (1.0, or 0.01 for T_ rules; negated
# when tflags contain "nice").
#
# Dies if a rules file cannot be opened.
sub readrules {
  foreach my $indir (@_) {
    my @files = <$indir/[0-9]*.cf>;

    foreach my $file (sort @files) {
      # Three-argument open on a lexical handle, and fail loudly: an
      # unreadable file used to be skipped without any diagnostic.
      open (my $in, '<', $file) or die "cannot read $file: $!";

      while (<$in>) {
        s/#.*$//g; s/^\s+//; s/\s+$//; next if /^$/;

        my $lang = '';
        if (s/^lang\s+(\S+)\s+//) {
          $lang = $1;
        }

        if (/^(header|rawbody|body|full|uri|meta)\s+(\S+)\s+/) {
          my $type = $1;
          my $name = $2;

          my $issubrule = '0';
          if ($name =~ /^__/) { $issubrule = '1'; }

          # Key on the rule name; this used to key on $1 (the rule type),
          # creating a stray entry such as $rules->{body}.
          $rules->{$name} ||= { };
          $rules->{$name}->{type} = $type;
          $rules->{$name}->{lang} = $lang;
          $rules->{$name}->{issubrule} = $issubrule;

          # Keep tflags that were read before the definition.
          $rules->{$name}->{tflags} = ''
            unless defined $rules->{$name}->{tflags};

        } elsif (/^describe\s+(\S+)\s+(.+)$/) {
          $rules->{$1} ||= { };
          $rules->{$1}->{describe} = $2;

        } elsif (/^tflags\s+(\S+)\s+(.+)$/) {
          $rules->{$1} ||= { };
          $rules->{$1}->{tflags} = $2;

        } elsif (/^score\s+(\S+)\s+(.+)$/) {
          my($name,$score) = ($1,$2);
          $rules->{$name} ||= { };
          if ( $score =~ /\s/ ) { # there are multiple scores
            ($score) = (split(/\s+/,$score))[$scoreset];
          }
          $rules->{$name}->{score} = $score;
        }
      }
      close $in;
    }
  }

  foreach my $rule (keys %{$rules}) {
    if (!defined $rules->{$rule}->{type}) {
      delete $rules->{$rule};   # no rule definition -> no rule
      next;
    }

    if (!defined $rules->{$rule}->{score}) {
      my $def = 1.0;
      if ($rule =~ /^T_/) { $def = 0.01; }

      if ($rules->{$rule}->{tflags} =~ /nice/) {
        $rules->{$rule}->{score} = -$def;
      } else {
        $rules->{$rule}->{score} = $def;
      }
    }
  }
}

# Dump the file-scoped $rules and $scores structures to $outfile as Perl
# source (via Data::Dumper), preceded by a timestamp comment and followed
# by a true value so the file can be loaded with require/do.
#
# Dies if the file cannot be opened for writing or cannot be closed
# cleanly.
sub writerules {
  my $outfile = shift;

  # Make sure the directory that will hold the output file exists.  This
  # used to be done by passing $outfile through a shell command line
  # ("mkdir -p ... ; rmdir ..."), which misbehaves for names containing
  # spaces or shell metacharacters.
  require File::Basename;
  require File::Path;
  my $outdir = File::Basename::dirname($outfile);
  if (!-d $outdir) {
    # Best effort, as before: a failure here surfaces at open() below.
    eval { File::Path::mkpath($outdir); };
  }

  open (my $out, '>', $outfile) or die "cannot write to $outfile: $!";
  print {$out} "# dumped at ".`date`."\n";

  $Data::Dumper::Purity = 1;
  print {$out} Data::Dumper->Dump ([$rules, $scores], ['*rules', '*scores']);

  print {$out} "1;";

  # Buffered write errors only show up when the handle is closed.
  close $out or die "cannot write to $outfile: $!";
}





# limitations under the License.
# </@LICENSE>


use FindBin;
use lib "$FindBin::Bin/../lib";
use Mail::SpamAssassin::Masses;
use Getopt::Long qw(:config bundling auto_help);
use Pod::Usage;
use strict;
use warnings;


use vars qw {
  $opt_f $opt_m $opt_M $opt_X $opt_p $opt_x $opt_h $opt_l $opt_L $opt_c
  $opt_a $opt_t $opt_s $opt_z $opt_inclang $opt_auto
};

GetOptions("c|cffile=s@" => \$opt_c,
	   "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
	   "l|logfile=s" => \$opt_l,
	   "f|falses" => \$opt_f,
	   "a|all" => \$opt_a,
	   "p|percentages" => \$opt_p,
	   "x|extended" => \$opt_x,
	   "m|matchrule=s" => \$opt_m, #,
	   "t|tflags=s" => \$opt_t,
	   "M|matchlog=s" => \$opt_M,
	   "X|excludelog=s" => \$opt_X,
	   "L|language=s" => \$opt_L,
	   "include-language=s" => \$opt_inclang);

    -c p   use p as the rules directory
    -f     falses. count only false-negative or false-positive matches
    -m RE  print rules matching regular expression
    -t RE  print rules with tflags matching regular expression
    -M RE  only consider log entries matching regular expression
    -X RE  don't consider log entries matching regular expression
    -l LC  also print language specific rules for lang code LC (or 'all')
    -L LC  only print language specific rules for lang code LC (or 'all')
    -a     display all tests
    -p     percentages. implies -x
    -x     extended output, with S/O ratio and scores
    -s SC  which scoreset to use
    -i     use IG (information gain) for ranking

=head1 NAME

hit-frequencies - Display statistics about tests hit by a mass-check run

=head1 SYNOPSIS
    are \"spam.log\" and \"ham.log\" in the cwd.

hit-frequencies [options]
}

 Options:
    -c,--cffile=path	  Use path as the rules directory
    -s,--scoreset=n	  Use scoreset n
    -l,--logfile=file	  Read in file instead of masses.log
    -f			  Count only false-positives/false-negatives
    -a			  Report all tests (including subrules)
    -p			  Report percentages instead of raw hits
    -x			  "Extended" output, include RANK, S/O and SCORE
    -m,--matchrule=re     Print rules matching the regular expression
    -t,--tflags=re	  Print only rules with tflags matching the regular expression
    -M,--matchlog=re      Consider only logs matching the regular expression
    -X,--excludelog=re	  Exclude logs matching this regular expression
    -L,--language=lc	  Only print language specific tests for specified lang code (try 'all')
    --include-language=lc Also print language specific tests for specified lang code (try 'all')

=head1 DESCRIPTION
  $opt_x = 1;
}

B<hit-frequencies> will read the mass-check log F<masses.log> or the
log given by the B<--logfile> option. The output will contain a
summary of the number of ham and spam messages and detailed statistics
for each rule. By default, B<hit-frequencies> will try to guess the
proper values for B<--cffile> based on the header of the
masses.log. The output will include the following columns:

=over 4

=item OVERALL
my %freq_ham = ();
my $num_spam = 0;
my $num_ham = 0;
my %ranking = ();
my $ok_lang = '';

Number of times (or percentage with B<-p>) the rule hit on
all messages (spam or ham).

=item SPAM
if ($ok_lang eq 'all') { $ok_lang = '.'; }

Number of times (or percentage with B<-p>) the rule hit on
spam messages.

=item HAM
       ($rules{$key}->{lang} &&
         (!$ok_lang || $rules{$key}->{lang} !~ /^$ok_lang/i)
     ) ) {
    delete $rules{$key} ; next;
  }

Number of times (or percentage with B<-p>) the rule hit on
ham messages.
}

=item S/O

Shown only with B<-x> or B<-p>, this is the number of spam hits
divided by total number of hits (C<S/O> refers to spam divided by
overall).

=item RANK
  my $sorting = $opt_i ? "IG" : "RANK";
  if ($opt_f) {
    printf "%7s %7s %7s  %6s  %6s  %6s  %s\n",
  	"OVERALL%", "FNEG%", "FPOS%", "S/O", $sorting, "SCORE", "NAME";
  } else {
    printf "%7s %7s  %7s  %6s  %6s  %6s  %s\n",
  	"OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME";
  }
  printf "%7d  %7d  %7d  %7.3f %6.2f  %6.2f  (all messages)\n",
  	$hdr_all, $hdr_spam, $hdr_ham,
        soratio ($num_spam,$num_ham), 0, 0;

Shown only with B<-x> or B<-p>, this is a measure that attempts to
indicate how I<good> or I<useful> a test is. The higher it is, the
better the test.
  printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  (all messages as %%)\n",
  	$hdr_all, $hdr_spam, $hdr_ham,
        soratio ($num_spam,$num_ham), 0, 0;

=item SCORE
  printf "%7s %7s  %7s  %6s  %6s %6s  %s\n",
  	"OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME";
  printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  (all messages)\n",
  	$hdr_all, $hdr_spam, $hdr_ham,
        soratio ($num_spam,$num_ham), 0, 0;

Shown only with B<-x> or B<-p>, this is the current score assigned to
the rule.
  	"OVERALL", "SPAM", "HAM", "NAME";
  printf "%10d  %10d  %10d  (all messages)\n",
  	$hdr_all, $hdr_spam, $hdr_ham;
}

=item NAME
my @tests = ();
my $rank_hi = 0;
my $rank_lo = 9999999;

This is the rule's name.
my %wanted;
my %unwanted;
my %wranks;
my %uranks;

=back
  next unless (exists $rules{$test});           # only valid tests
  next if (!$opt_a && $rules{$test}->{issubrule});

=head1 BUGS
  push (@tests, $test);

Please report bugs to http://bugzilla.spamassassin.org/
  if ($rules{$test}->{tflags} =~ /nice/) { $isnice = 1; }

=head1 SEE ALSO
  my $fn = $freq_ham{$test}; $fn ||= 0;
  my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0;
  my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0;

L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>

=cut
    $soratio = 1.0 - $soratio;
    my $tmp = $fsadj; $fsadj = $fnadj; $fnadj = $tmp;
  }

if ($opt_L && $opt_inclang) {
  pod2usage("-L/--language and --include-language are mutually exclusive");
    my $rank;

    # New new system: from "Learning to Filter Unsolicited Commercial E-Mail",
    # Ion Androutsopoulos et al: determine the information gain IG(X, C) of the
    # Boolean attributes (ie. the rules). Measures "the average reduction in
    # the entropy of C (classification) given the value of X (the rule)". Makes
    # a good ranking measure with a proper statistical basis. ;)
    #
    # Still would like to get an entropy measure in, too.
    #
    #             sum                                    P(X = x ^ C = c)
    # IG(X,C) = x in [0, 1]    P(X = x ^ C = c) . log2( ------------------- )
    #           c in [Ch, Cs]                           P(X = x) . P(C = c)
    #
    my $safe_nspam = $num_spam || 0.0000001;
    my $safe_nham = $num_ham || 0.0000001;

    my $num_all = ($num_spam + $num_ham);
    my $safe_all = $num_all || 0.0000001;
    my $f_all = $fs+$fn;

    my $px0 = (($num_all - $f_all) / $safe_all);         # P(X = 0)
    my $px1 = ($f_all / $safe_all);                      # P(X = 1)
    my $pccs = ($num_spam / $safe_all);                  # P(C = Cs)
    my $pcch = ($num_ham / $safe_all);                   # P(C = Ch)
    my $px1ccs = ($fs / $safe_nspam);                   # P(X = 1 ^ C = Cs)
    my $px1cch = ($fn / $safe_nham);                    # P(X = 1 ^ C = Ch)
    my $px0ccs = (($num_spam - $fs) / $safe_nspam);     # P(X = 0 ^ C = Cs)
    my $px0cch = (($num_ham - $fn) / $safe_nham);       # P(X = 0 ^ C = Ch)
    my $safe_px0_dot_pccs = ($px0 * $pccs) || 0.00000001;
    my $safe_px0_dot_pcch = ($px0 * $pcch) || 0.00000001;
    my $safe_px1_dot_pccs = ($px1 * $pccs) || 0.00000001;
    my $safe_px1_dot_pcch = ($px1 * $pcch) || 0.00000001;

    sub log2 { return log($_[0]) / 0.693147180559945; } # log(2) = 0.6931...

    my $safe_px0ccs = ($px0ccs || 0.0000001);
    my $safe_px0cch = ($px0cch || 0.0000001);
    my $safe_px1ccs = ($px1ccs || 0.0000001);
    my $safe_px1cch = ($px1cch || 0.0000001);
    $rank = ( $px0ccs * log2($safe_px0ccs / $safe_px0_dot_pccs) ) +
                    ( $px0cch * log2($safe_px0cch / $safe_px0_dot_pcch) ) +
                    ( $px1ccs * log2($safe_px1ccs / $safe_px1_dot_pccs) ) +
                    ( $px1cch * log2($safe_px1cch / $safe_px1_dot_pcch) );

    $ranking{$test} = $rank;
    $rank_hi = $rank if ($rank > $rank_hi);
    $rank_lo = $rank if ($rank < $rank_lo);
  }
  else {
    # basic wanted/unwanted ranking
    $wanted{$test} = $isnice ? $fn : $fs;
    $unwanted{$test} = $isnice ? $fs : $fn;
    # count number of ranks of each type
    $wranks{$wanted{$test}} = 1;
    $uranks{$unwanted{$test}} = 1;
  }
}

if ($opt_p) {
  $opt_x = 1;
  my @wanted = sort { $wanted{$a} <=> $wanted{$b} } keys %wanted;
  my @unwanted = sort { $unwanted{$b} <=> $unwanted{$a} } keys %unwanted;

  # first half of ranking is the wanted rank
  my $position = 0;
  my $last = undef;
  for my $test (@wanted) {
    $position++ if defined $last && $last != $wanted{$test};
    $ranking{$test} += $position;
    $last = $wanted{$test}
  }

  # second half of ranking is the unwanted rank
  my $normalize = (scalar keys %wranks) / (scalar keys %uranks);
  $position = 0;
  $last = undef;
  for my $test (@unwanted) {
    $position++ if defined $last && $last != $unwanted{$test};
    $ranking{$test} += ($position * $normalize);
    $last = $unwanted{$test};
    $rank_hi = $ranking{$test} if ($ranking{$test} > $rank_hi);
    $rank_lo = $ranking{$test} if ($ranking{$test} < $rank_lo);
  }
}

$opt_s = 0 if ( !defined $opt_s );
  # now normalise the rankings to [0, 1]
  $rank_hi -= $rank_lo;
  foreach $test (@tests) {
    $ranking{$test} = $rank_hi == 0 ? 0.001 : ($ranking{$test} - $rank_lo) / ($rank_hi);
  }
}

my $ok_lang = lc ( $opt_inclang || $opt_L || '');
$ok_lang = '.' if ($ok_lang eq 'all');
  next if (!$opt_a && $rules{$test}->{issubrule});

# Rule filter callback, handed to Mail::SpamAssassin::Masses->new() as the
# "greprules" option.  It is called with a rule name and the rule's hash
# reference and returns 1 if the rule should be kept, 0 if it should be
# skipped.  The decision depends on the command-line options captured from
# the enclosing scope ($opt_m, $opt_t, $opt_L, $opt_a, $opt_s) and on
# $ok_lang.
my $greprules = sub { # To determine whether rule should be read
  my ($name, $rule) = @_;
  # NOTE(review): $fa is never used inside this closure, and $fs/$fn are not
  # defined here -- this line looks like a leftover from the old per-test
  # loop.  Confirm and remove.
  my $fa = $fs+$fn;

  return 0 if ($opt_m && $name !~ /$opt_m/); # name doesn't match -m
                                             # expression
  return 0 if ($opt_t && $rule->{tflags} !~ /$opt_t/); # tflags don't
                                                       # match -t
                                                       # expression
  # Reject when -L was given but the rule has no language, or when the rule
  # has a language that does not start with the accepted one ($ok_lang is
  # '.' for "all", so any language matches in that case).
  return 0 if (($opt_L && !$rule->{lang}) ||
	   ($rule->{lang} &&
	    (!$ok_lang || $rule->{lang} !~ /^$ok_lang/i))); # Wrong language

  # Sub-rules are only reported when -a is given.
  return 0 if ($rule->{issubrule} && !$opt_a);

  # Unless -a or -t was given, drop rules that cannot apply to this run:
  # "net" rules when the scoreset number is even, and "userconf" rules always.
  if (!$opt_a && !$opt_t) {
    return 0 if ($rule->{tflags} =~ /net/ && ($opt_s % 2 == 0));
    return 0 if ($rule->{tflags} =~ /userconf/); # or userconf
  }
  return 1;

};
  my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0;
  my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0;

  if ($opt_f && $fsadj == 0 && $fnadj == 0) { next; }

my $logfile = $opt_l || "masses.log";
    $fa = ($fa / ($num_spam + $num_ham)) * 100.0;
    $fs = $fsadj;
    $fn = $fnadj;
  }

if (!$opt_c || !scalar(@$opt_c)) {
    # Try to read this in from the log, if possible
    open IN, $logfile or die "Can't open $logfile: $!";
    my $files = 0; # are we in the files section?
    while(<IN>) {
	if (!$files) {
	    if (/^\# SVN revision:/) {
		$opt_c = [ "$FindBin::Bin/../rules" ];
		last;
	    } elsif (/^\# Using configuration:$/) {
		$files = 1;
	    }
	} elsif (/^\#\s+(.*)\s*$/) {
	    push (@$opt_c, $1);
	} else {
	    # All done!
	    last;
	}
    }

    if (!defined $opt_c) {
      $opt_c = [ "$FindBin::Bin/../rules" ];
    }

    foreach my $file (@$opt_c) {
	die "Can't read $file" unless -r $file;
    }

  } else {
    printf "%10d  %10d  %10d  %s\n", $fa, $fs, $fn, $test;
  }
}
	    
my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
					       scoreset => $opt_s,
                                               falsesonly => $opt_f,
                                               greprules => $greprules,
                                               logfile => $logfile,
                                               nologs => 1});

$masses->readrules();
$masses->readlogs();
$masses->do_statistics();
$masses->do_rank();

my $rules = $masses->get_rules_hash();
my $num_ham = $masses->get_num_ham();
my $num_spam = $masses->get_num_spam();
my $num_all = $num_ham + $num_spam;

if ($num_ham + $num_spam <= 0) {
  die "Can't run hit-frequencies on 0 messages.";
}

## Write header
    open (IN, "<$file") || die "Could not open file '$file': $!";

if ($opt_p) {

  if ($opt_f) {
    printf "%7s %7s %7s  %6s  %6s  %6s  %s\n",
  	"OVERALL%", "FNEG%", "FPOS%", "S/O", "RANK", "SCORE", "NAME";
  } else {
    printf "%7s %7s  %7s  %6s  %6s  %6s  %s\n",
  	"OVERALL%", "SPAM%", "HAM%", "S/O", "RANK", "SCORE", "NAME";
  }

  printf "%7d  %7d  %7d  %7.3f %6.2f  %6.2f  (all messages)\n",
  	$num_all, $num_spam, $num_ham,
        $num_spam / $num_all, 0, 0;
      $_ = $4; s/,,+/,/g;

  printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  (all messages as %%)\n",
  	100.0, $num_spam / $num_all * 100.0, $num_ham / $num_all * 100.0,
        $num_spam / $num_all, 0, 0;
        } else {
          $num_spam++;
        }
      } else {
        if ($opt_f) {
          if ($caught) { $num_ham++; }
        } else {
          $num_ham++;
        }
      }

} elsif ($opt_x) {
  printf "%7s %7s  %7s  %6s  %6s %6s  %s\n",
  	"OVERALL", "SPAM", "HAM", "S/O", "RANK", "SCORE", "NAME";
  printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  (all messages)\n",
  	$num_all, $num_spam, $num_ham,
        $num_spam / $num_all, 0, 0;
          } else {
            $freq_spam{$t}++;
          }
	} else {
          if ($opt_f) {
            if ($caught) { $freq_ham{$t}++; }
          } else {
            $freq_ham{$t}++;
          }
	}
      }
    }
    close IN;
  }
}

} else {
  printf "%10s  %10s  %10s  %s\n",
  	"OVERALL", "SPAM", "HAM", "NAME";
  printf "%10d  %10d  %10d  (all messages)\n",
  	$num_all, $num_spam, $num_ham;
}

foreach my $test (sort { $rules->{$b}->{rank} <=> $rules->{$a}->{rank} } keys %{$rules}) {
  my ($s, $n) = @_;

  if ($opt_p) {
    printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  %s\n",
  	($rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham}) / $num_all * 100.0,
        $rules->{$test}->{spam_percent}, $rules->{$test}->{ham_percent},
        $rules->{$test}->{soratio}, $rules->{$test}->{rank}, $rules->{$test}->{score}, $test;
  } elsif ($opt_x) {
    printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  %s\n",
  	$rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham},
        $rules->{$test}->{freq_spam}, $rules->{$test}->{freq_ham},
        $rules->{$test}->{soratio}, $rules->{$test}->{rank}, $rules->{$test}->{score}, $test;
  } else {
    printf "%10d  %10d  %10d  %s\n",
        $rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham},
        $rules->{$test}->{freq_spam}, $rules->{$test}->{freq_ham}, $test;
  }
}

# tcr($nspam, $nlegit, $nspamspam, $nlegitspam)
#
# Compute the Total Cost Ratio: the weighted error rate of doing no
# filtering at all, divided by the weighted error rate actually achieved.
#
#   $nspam       total count of spam messages
#   $nlegit      total count of legitimate messages
#   $nspamspam   count subtracted from $nspam to get the missed-spam count
#                (presumably spam that was flagged as spam)
#   $nlegitspam  count of legitimate messages charged as errors
#                (presumably legitimate mail flagged as spam)
#
# Errors on legitimate mail are weighted $lambda (99) times as heavily as
# missed spam.  Returns the ratio as a number; returns 0 when both $nspam
# and $nlegit are zero, since no ratio can be computed from no messages.
sub tcr {
  my ($nspam, $nlegit, $nspamspam, $nlegitspam) = @_;

  my $lambda = 99;

  # Shared denominator of both error rates.  It is zero only when there are
  # no messages at all; bail out instead of dying on a division by zero.
  my $weighted_total = $lambda * $nlegit + $nspam;
  return 0 if $weighted_total == 0;

  my $nspamlegit = $nspam - $nspamspam;

  my $werr = ($lambda * $nlegitspam + $nspamlegit) / $weighted_total;
  my $werr_base = $nspam / $weighted_total;

  $werr ||= 0.000001;     # avoid / by 0 when there were no errors at all
  return $werr_base / $werr;
}




=head1 NAME

perceptron - Generate scores for SpamAssassin using the "Stochastic
Gradient Method"

=head1 SYNOPSIS

perceptron [options]

 Options:
  -p ham_preference 	Modifies tendency to prefer false negatives over
			false positives (default 2.0) (higher = less fp)
  -e num_epochs		Set number of passes to make (default 15)
  -l learning_rate	Modifies learning rate (default 2.0)
  -w weight_decay 	Scores multiplied by this value after each pass
			to prevent scores from getting too high
			(default off (1.0))

=head1 DESCRIPTION

This algorithm is used to optimize SpamAssassin scores, based on the
input given by B<logs-to-c>. At the time of writing, the output of
logs-to-c needs to be compiled into the source before perceptron can
be used, but this will be fixed soon, I hope.

=head1 SEE ALSO

L<logs-to-c(1)>

=cut




# limitations under the License.
# </@LICENSE>

=head1 NAME

rewrite-cf-with-new-scores - Rewrite SpamAssassin scores file with new
scores.

=head1 SYNOPSIS

rewrite-cf-with-new-scores [options]

  Options
  --old-scores=file    Read file containing the old SpamAssassin scores
  --new-scores=file    Read file containing the new SpamAssassin scores
  -s,--scoreset n      Rewrite scoreset n
  --output=file        Output rewritten score file to file
  -c,--cffile=path     Use path as the rules directory
  -l,--logfile=file    Use file instead of masses.log (for guessing -c)

 Note: these options can be shortened (e.g. --old, --new, --out) as
 long as they are unambiguous.

=head1 DESCRIPTION

B<rewrite-cf-with-new-scores> is a tool to update the sitewide scores
file with the newly generated scores. Since SpamAssassin has four
different scoresets, which each need to be generated separately, this
tool is used to only change the correct scoreset.

By default, the old scores are read from 50_scores.cf in the rules
directory and the new ones from ./perceptron.scores. The output will
be ./50_scores.cf by default.

The rules directory needs to be used to make sure scores are given for
the right tests. Rules not found in the rules directory will not be
given scores in the output.

=head1 BUGS

Please report bugs to http://bugzilla.spamassassin.org/

=head1 SEE ALSO

L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>

=cut

use FindBin;
use lib "$FindBin::Bin/../lib";
use Getopt::Long qw(:config bundling auto_help);
use Mail::SpamAssassin::Masses;
use Pod::Usage;
use strict;
use warnings;

use vars qw($opt_old $opt_new $opt_scoreset $opt_out $opt_c $opt_l);

GetOptions("old-scores=s" => \$opt_old,
	   "new-scores=s" => \$opt_new,
	   "s|scoreset=i" => \$opt_scoreset,
	   "output=s" => \$opt_out,
	   "c|cffile=s@" => \$opt_c,
	   "l|logfile=s" => \$opt_l);

$opt_l ||= "masses.log";
$opt_scoreset = 0 unless defined $opt_scoreset;

my $NUM_SCORESETS = 4;

# No rules directory was given with -c/--cffile: try to work it out from the
# header comments of the mass-check log named by $opt_l.
#
#   * A "# SVN revision:" line seen before any configuration listing selects
#     the rules directory relative to this script ($FindBin::Bin/../rules).
#   * A "# Using configuration:" line starts a listing; each following
#     "# <path>" line contributes one path, and the first line that is not
#     of that form ends the listing.
#
# If nothing was collected, fall back to $FindBin::Bin/../rules.  Every
# resulting path must be readable, otherwise the script dies.
if (!$opt_c || !scalar(@$opt_c)) {
    # Three-argument open with a lexical handle: the log name is never
    # interpreted as a mode or a pipe, and the handle is closed below.
    open(my $log, '<', $opt_l) or die "Can't open $opt_l: $!";
    my $in_listing = 0;   # true once "# Using configuration:" has been seen
    while (my $line = <$log>) {
        if (!$in_listing) {
            if ($line =~ /^\# SVN revision:/) {
                $opt_c = [ "$FindBin::Bin/../rules" ];
                last;
            }
            $in_listing = 1 if $line =~ /^\# Using configuration:$/;
        }
        elsif ($line =~ /^\#\s+(.*?)\s*$/) {
            # Non-greedy capture so that trailing whitespace (including a
            # stray CR) is not kept as part of the path.
            push (@$opt_c, $1);
        }
        else {
            # All done!
            last;
        }
    }
    close($log);

    if (!defined $opt_c) {
        $opt_c = [ "$FindBin::Bin/../rules" ];
    }

    foreach my $file (@$opt_c) {
        die "Can't read $file" unless -r $file;
    }
}

if (!$opt_old) {
  $opt_old = $$opt_c[0] . "/50_scores.cf";
  # Note, the spaces need to stay in front of the require to work around a RPM 4.1 problem
  require "./tmp/rules.pl";
}
else {
  die "parse-rules-for-masses had no error but no tmp/rules.pl!?!";
}

$opt_new ||= "50_scores.cf";

my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
					       scoreset => $opt_scoreset});

$masses->readrules();
my $rules = $masses->get_rules_hash();

# now read the generated scores
my @gascoreorder = ();
my %oldscores = ();
my %gascorelines = ();
open (STDIN, "<$opt_new") or die "cannot open $opt_new";
while (<STDIN>) {
  /^score\s+(\S+)\s+(-?\d+(?:\.\d+)?)/ or next;
  my $name = $1;  my $score = $2;
  next unless (exists ($rules->{$name}) && !$rules->{$name}->{issubrule});
  next if ($name =~ /^__/);
  next if ($name eq '(null)');	# er, oops ;)


  push (@gascoreorder, $name);
}

open (IN, "<$opt_old") or die "cannot open $opt_old";
my $out = '';
my $pre = '';


while (<IN>) {
  if (/^\s*score\s+(\S+)\s/) {
    delete $gascorelines{$1};
    next unless (exists ($rules->{$1}) && $rules->{$1}->{issubrule} == 0);
  }
  $pre .= $_;
  /^# Start of generated scores/ and last;

  if (/^\s*score\s+\S+/) {
    my($score,$name,@scores) = split;

    next unless (exists ($rules->{$name}) && !$rules->{$name}->{issubrule});
    if (defined $gascorelines{$name}) {
      # Set appropriate scoreset value
      $scores[$opt_scoreset] = $gascorelines{$name};

      # Create new score line
      $_ = join(" ","score",$name,generate_scores(@scores))."\n";

}
close IN;

open OUT, ">$opt_out" or die "Can't open $opt_out: $!";

# and output the lot
print OUT $pre, "\n";
foreach my $name (@gascoreorder) {
  $_ = $gascorelines{$name};
  next unless (defined ($_));

  @scores = @{$oldscores{$name}} if ( exists $oldscores{$name} );

  # Set appropriate scoreset value
  $scores[$opt_scoreset] = $_;

  # Create new score line
  print OUT join(" ","score",$name,generate_scores(@scores)),"\n";
}
print OUT "\n", $out, "\n";

sub generate_scores {
  my (@scores) = @_;




#!/usr/bin/perl -w

# mboxget - get a message from a mailbox
#
# usage: mboxget [mass-check-mbox-id ...]
#
# <@LICENSE>
# Copyright 2004 Apache Software Foundation
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

use strict;

# Basename of this program, used as the prefix of error messages.
my $prog = $0;
$prog =~ s@.*/@@;

# Each argument is "<mbox-file>" or "<mbox-file>.<byte-offset>".  For each
# one, print a single message: start reading at the offset (or at the top of
# the file when no offset is given) and stop at the next line beginning with
# "From ".
foreach my $where (@ARGV) {
  my ($file, $offset) = ($where =~ m/(.*?)(?:\.(\d+))?$/);

  # Three-argument open with a lexical handle, so a file name taken from the
  # command line can never be interpreted as an open mode or a pipe.
  open(my $input, '<', $file) or die("$prog: open $file failed: $!\n");
  if ($offset) {
    seek($input, $offset, 0) or die("$prog: seek $offset failed: $!\n");
  }

  # The first line read is always printed (it is normally the message's own
  # "From " separator); only later "From " lines end the message.
  my $past = 0;
  while (my $line = <$input>) {
    if ($past) {
      last if substr($line,0,5) eq "From ";
    }
    else {
      $past = 1;
    }
    print $line;
  }
  close $input;
}




date > test.end

# results name
mv masses.log masses-$net$username.log
mv ham.log ham-$net$username.log

# rsync
set +e
retry=0
while true; do
	if rsync -CPcvuzb --timeout=120 masses-$net$username.log $username@rsync.spamassassin.org::corpus/; then
		break;
	fi
	if [ $retry -eq 120 ]; then

	sleep 30
done
set -e





    @files = sort readdir(CORPUS);
    closedir(CORPUS);

    @files = grep { /^masses-(?:net-)?\w+\.log$/ && -f "$opt{corpus}/$_" && -M _ < 10 } @files;
    @files = grep {
	my $time = 0;
	my $tag = 0;

	}
	$time;
    } @files;

}

sub rename {


	    next if ($class eq "NET" && $age !~ /^(?:new|all|age|7day)$/);

	    print STDERR "logs: " . join(' ', @files) . "\n";
	    my @spam = grep { /^spam/ } @files;

	    print STDERR "ham: " . join(' ', @ham) . "\n";
	    print STDERR "spam: " . join(' ', @spam) . "\n";

	    chdir $opt{corpus};

	    # net vs. local
	    if ($class eq "NET") {
		@files = grep { /-net-/ } @files;
		print STDERR "logs: " . join(' ', @files) . "\n";
		print STDERR "ham: " . join(' ', @ham) . "\n";
		print STDERR "spam: " . join(' ', @spam) . "\n";
	    }
	    else {
		# if both net and local exist, use newer
		my %spam;
		my %ham;
		
		for my $file (@files) {
		    $logs{$1}++ if ($file =~ m/-(\w+)\.log$/);
		}
		while (my ($user, $count) = each %logs) {
		    $ham{$1}++ if ($file =~ m/-(\w+)\.log$/);
		}
		while (my ($user, $count) = each %ham) {
		    if ($count > 1) {
			my $nightly = "masses-$user.log";
			my $weekly = "masses-net-$user.log";
			if ($revision{$nightly} >= $revision{$weekly}) {
			    @files = grep { $_ ne $weekly } @files;
			}
			else {
			    @files = grep { $_ ne $nightly } @files;
			}
		    }
		}
		print STDERR "logs: " . join(' ', @files) . "\n";
		    if ($count > 1) {
			my $nightly = "spam-$user.log";
			my $weekly = "spam-net-$user.log";
			if ($revision{$nightly} >= $revision{$weekly}) {
			    @spam = grep { $_ ne $weekly } @spam;
			}
			else {
			    @spam = grep { $_ ne $nightly } @spam;
			}
		    }
		}
		print STDERR "ham: " . join(' ', @ham) . "\n";
		print STDERR "spam: " . join(' ', @spam) . "\n";
	    }
	    
	    # age
	    if ($class eq "NET" && $age ne "7day") {
		@files = grep { -M "$_" < 10 } @files;
		@spam = grep { -M "$_" < 10 } @spam;
		# find most recent CVS revision
		my $wanted = 0.0;
		for (@spam, @ham) {
		    $wanted = $revision{$_} if ($revision{$_} > $wanted);
		}
		@files = grep { $revision{$_} eq $wanted } @files;

		print STDERR "logs: " . join(' ', @files) . "\n";
		print STDERR "spam: " . join(' ', @spam) . "\n";
	    }
	    elsif ($age =~ /^(?:new|all|age)$/) {
		@files = grep { -M "$_" < -M $opt{tagtime} } @files;

		@files = grep { $revision{$_} eq $revision } @files;

		print STDERR "logs: " . join(' ', @files) . "\n";
		print STDERR "spam: " . join(' ', @spam) . "\n";
	    }
	    elsif ($age =~ /(\d+)day/) {
		my $mtime = $1;
		@files = grep { -M "$_" < $mtime } @files;

		print STDERR "logs: " . join(' ', @files) . "\n";
		print STDERR "spam: " . join(' ', @spam) . "\n";
	    }
	    
	    open(OUT, "> $opt{html}/$class.$age");
	    print OUT "# results used: " . join(" ", @files) . "\n";

	    for (@files) {
		print OUT "# $_=$revision{$_}\n" if $revision{$_} ne $revision;
	    }
	    for (@spam) {
		print OUT "# $_=$revision{$_}\n" if $revision{$_} ne $revision;
	    }

	    my $flags = "";
	    $flags = "-t net -s 1" if $class eq "NET";
	    $flags = "-M HTML_MESSAGE" if $class eq "HTML";

	    if ($age eq "all") {
		my %logs;
		my %ham;
		my @output;
		
		for my $file (@files) {
		    $logs{$1} = $file if ($file =~ m/-(\w+)\.log$/);
		}

		unlink "$opt{tmp}/masses.log.$$";

		next unless (scalar keys %logs);
		for my $user (sort keys %logs) {

		for my $user (sort keys %spam) {
		    next unless defined $ham{$user};
		    chdir "$opt{tree}/masses";
		    system("cat $opt{corpus}/$logs{$user} >> $opt{tmp}/masses.log.$$");
		    open(IN, "./hit-frequencies -xpa $flags -l $opt{corpus}/$logs{$user} |");
		    open(IN, "./hit-frequencies -xpa $flags $opt{corpus}/$spam{$user} $opt{corpus}/$ham{$user} |");
		    while(<IN>) {
			chomp;
			push @output, "$_:$user\n";
		    }
		    close(IN);
		}
		open(IN, "./hit-frequencies -xpa $flags -l $opt{tmp}/masses.log.$$ |");
		while(<IN>) {
		    push @output, $_;
		}

		    my ($after, $before) = split(/-/, $which);
		    # get and filter logs
		    chdir $opt{corpus};

		    open(TMP, "> $opt{tmp}/masses.log.$$");
		    for my $file (@files) {
		      open(IN, $file);
		      while (<IN>) {
			print TMP $_ if time_filter($after, $before);
		      }
		      close(IN);
			    close(IN);
			}
			close (TMP);
		    }
		    close (TMP);

		    # print out by age
		    chdir "$opt{tree}/masses";
		    open(IN, "./hit-frequencies -xpa $flags -l $opt{tmp}/masses.log.$$ |");
		    while(<IN>) {
			chomp;
			push @output, "$_:$which\n";

		    print OUT $_;
		}
	    }
	    elsif (@files) {
		# get logs
		system("cat " . join(" ", @files) . " > $opt{tmp}/masses.log.$$");
		system("cat " . join(" ", @spam) . " > $opt{tmp}/spam.log.$$");
	
		chdir "$opt{tree}/masses";
		open(IN, "./hit-frequencies -xpa $flags -l $opt{tmp}/masses.log.$$ |");
		while(<IN>) {
		    print(OUT);
		}





HOW TO GENERATE YOUR OWN SCORES FOR SPAMASSASSIN
------------------------------------------------

Duncan Findlay
<duncf@debian.org>


1. Introduction

One of the reasons SpamAssassin is so accurate is that its scores are
carefully optimized based on collections (aka. corpus, plural:
corpora) of mail from volunteers all across the world. Each volunteer
uses a script ("mass-check") to run SpamAssassin over each piece of
mail in their corpus. They then submit the results to a central server
where the SpamAssassin development team runs the scoring mechanism to
generate optimal scores.

SpamAssassin uses four different scoresets depending on the options
used. These are almost always referred to by number, as shown below:

Scoreset
   0	 - Network tests disabled, Bayes disabled
   1     - Network tests enabled, Bayes disabled
   2     - Network tests disabled, Bayes enabled
   3     - Network tests enabled, Bayes enabled

Things are further complicated by the fact that when Bayes is enabled,
it automatically learns using the equivalent scoreset with Bayes
disabled. As a result, optimal scores for scoresets 2 and 3 can only
be generated after scoresets 0 and 1. Set 0 logs can be generated from
set 1 logs, but sets 2 and 3 need to be done separately.

As a result, volunteers who take part in our rescoring survey need to
run 3 mass-checks, each of which can take many hours. Since the
generation of scores is such a laborious process, the SpamAssassin
developers only perform this once per release.

Luckily, the previous score optimizer, a Genetic Algorithm, which took
almost 24 hours to optimize scores for one scoreset has been replaced
with the Perceptron (thanks to Henry Stern) which uses a "Stochastic
Gradient Descent" method. Don't worry if you don't understand what
this means, I certainly don't. The Perceptron takes less than 15
seconds to generate scores of roughly equal quality as the GA.


2. Compiling a Corpus

The first step to generating your own scores is to start collecting
mail, both ham (non-spam) and spam. These should be representative of
all the mail you receive, but you should filter out spam related
lists, like spamassassin-users to avoid skewing results. It is
essential that these corpora be very well classified. It will greatly
reduce the effectiveness of your scores if spam mails get misfiled
into your ham folder and vice versa.

Also, it is important to note that SpamAssassin is not designed to be
a virus filter, so it's best if you filter out viruses from your ham
and spam folders too.

Furthermore, since spam and ham characteristics change over time, it's
best to leave out mail over 6 months old. This is especially important for
network tests, since these are designed to stop current spam, and are
not historical records.

I'm not entirely sure how big corpora should be. The bigger, the
better. If your corpus is too small, it may not be sufficiently
representative of all the mail you receive, and accuracy will
suffer. My corpus of mail for the last 6 months is over 55000 messages
(35000 spam, 20000 ham).


3. Mass-check

Now that you've assembled your corpora, you need to use mass-check to
test each message with SpamAssassin. This script is surprisingly fast,
as it accesses the internal perl libraries of SpamAssassin, without
the need to load a new perl process each time (as you would if you
piped each message through spamassassin). Doing a scoreset 2 run (no
network, bayes enabled) I get roughly 10,000 messages an hour on an
unloaded Pentium 4, 2.80Ghz computer with 512 MB RAM.

By default, if you are not running out of an unpacked source tree,
mass-check will read rules from the usual locations. As a result, you
should make sure ~/.spamassassin/user_prefs contains no rules, unless
you are planning on using your generated scores for only yourself, not
sitewide.

The first step is to define the locations of all of the messages in
your corpora (these are known as "targets"). I find it's easiest to
put this in a separate file with lines of the following format:

class:format:location

Class is either "spam" or "ham", format is "mbox", "file", "dir" or
"mbx" and location is the path to the mailbox. mass-check supports
using * as a wildcard, so the following target is permitted:

spam:mbox:/home/duncf/Maildir/Old/spam/*

Once you have placed all the "targets" necessary for your corpora, run
mass-check with the following command.

mass-check -f file

If you are doing a mass-check run for scoreset 1 or 3 (i.e. network tests
enabled) you will also need to add the --net option, and you will want
to add -j8 (or some other number) to indicate how many messages to
test in parallel. This is useful since a lot of time would otherwise
be spent waiting for network queries to return.

mass-check will generate a log file in the current directory entitled
masses.log. This is the log file that will enable us to optimize
scores.

For the impatient: if you're one of those people who want to know
exactly how far mass-check has gotten through your mail, use the
--showdots option.


4. Checking the quality of your corpora (a.k.a. Pulling Weeds)

In order to ensure that your corpora don't contain misfiled mails, it
is good to double check the highest scoring hams and lowest scoring
spams.

First check ham mail:

grep "^h" masses.log | sort -rn -k2,2 | head -20

If you want to read the corresponding messages try piping to
extract-message-from-mbox -m (see the extract-message-from-mbox
section for more detail).

Do the same with spam mail:

grep "^s" masses.log | sort -n -k2,2 | head -20


5. extract-message-from-mbox

extract-message-from-mbox takes a mbox filename and a byte offset and
outputs the corresponding mail message. With the -m option, mass-check
output (i.e. lines from masses.log) is read from the standard
input. Without, arguments are expected to be in the form
<mbox>.<offset> (i.e. /path/to/mbox.12345)

The -h option can also be used to only show message headers.

As shown above, it is quite useful to pipe portions of masses.log to
extract-message-from-mbox.


6. hit-frequencies

hit-frequencies doesn't really help you advance toward your goal of
optimizing scores, but it is very useful in evaluating locally created
rules. Run it, look at its output; you'll find it interesting (and if
not, feel free to skip to the next section).

hit-frequencies -x -p -s <scoreset>

hit-frequencies (and many other scripts) are set to automatically
guess where to find your configuration files based on
masses.log. Unfortunately, it isn't perfect (actually it's a rather
crude hack, but that's irrelevant). You may have to check masses.log
to figure out where it's searching and/or add --cffile options (you
can specify multiple paths using multiple --cffile options).

hit-frequencies -x -p generates the following output:

OVERALL%   SPAM%     HAM%     S/O    RANK   SCORE  NAME
  64008    40932    23076    0.639   0.00    0.00  (all messages)
100.000  63.9483  36.0517    0.639   0.00    0.00  (all messages as %)
 10.382  16.2342   0.0000    1.000   1.00    3.10  FORGED_MUA_OUTLOOK
  8.266  12.9263   0.0000    1.000   0.99    1.00  FORGED_OUTLOOK_TAGS
  6.484  10.1388   0.0000    1.000   0.98    4.50  DRUGS_ERECTILE_OBFU
[...]

The first two rows show the size of the corpora and their ham/spam
break down. The following lines list each rule found and give various
statistics about it based on your masses.log.

OVERALL% represents the percentage of total messages (spam and ham)
that the rule hits, SPAM% and HAM% show the percentages on each
corpus. S/O is SPAM% divided by the sum of SPAM% and HAM%. Generally good
(non-nice) rules have S/O's over 0.95, while nice (negative-scoring)
rules generally have S/O's less than 0.5. RANK is a human readable
indicator of how good a rule is. The higher the better, always. RANK
is designed to be a rough indicator of the score the perceptron is
likely to give it. SCORE is simply the current score. (This is simply
listed for convenience, not calculated in any way.)

If you do any rule development locally, you will find this is a great
tool. If you come up with some great rules (that we haven't already
thought of), please send us a patch at
http://bugzilla.spamassassin.org/.


7. lint-rules-from-freqs

This script is designed to read in your masses.log and the
SpamAssassin configuration files in order to find both bad syntax and
bad rules that hit few messages or (with -f) have too many false
positives/negatives, etc.

lint-rules-from-freqs -f -s <scoreset>

As with hit-frequencies, it tries to be smart with choosing the right
--cffile options.

This script is roughly the equivalent of running a spamassassin --lint
and running a hit-frequencies to determine which tests have bad S/O
ratios.


8. logs-to-c

logs-to-c is the program that converts a mass-check log into code that
can be easily used by the perceptron. Currently, it is necessary to
use the output of logs-to-c to even compile perceptron, but that
should hopefully change in the near future.

The files logs-to-c creates need to be in the tmp/ sub-directory of the
directory where perceptron.c is.

logs-to-c -o tmp/ -s <scoreset>

These files contain information about each rule such as whether or not
the perceptron is permitted to change the rule's score, the range
within which the perceptron can adjust it, whether or not a rule is
nice, etc. In addition, these files contain information about each
mail hit and which tests were hit. The files generated by logs-to-c
are not really easy to read, so don't try; use hit-frequencies
instead.


9. perceptron

perceptron is the brains behind the whole process. (And we must of
course thank the brain behind perceptron, Henry Stern, for his
contribution.)

While the perceptron takes options for things such as "ham
preference", "number of epochs", "learning rate" and "weight decay",
it's probably best to trust the defaults; unless of course you want to
try to find the optimum parameters (and post them to
http://bugzilla.spamassassin.org/ with your evidence).

The perceptron is incredibly quick. So start it, wait 15 seconds and
voila, your optimized scores are ready. The output is in
perceptron.scores.

Unfortunately, it needs to be built from source every time you want to
use it with a different masses.log or set of rules. In the directory
containing perceptron.c, try:

make perceptron
./perceptron

If you don't have the Makefile, try
gcc -g -O2 -Wall -o perceptron perceptron.c -lm
./perceptron


10. rewrite-cf-with-new-scores

perceptron dumps its results in perceptron.scores. Great. How does
that help you? rewrite-cf-with-new-scores takes care of changing the
old configuration files to correspond with the new scores. The script
takes into account rules found in your configuration, so make sure
that the --cffile argument is right (it'll read this from masses.log
by default). The syntax is:

rewrite-cf-with-new-scores --old 50_scores.cf --new perceptron.scores \
  --out 50_scores.new.cf -l masses.log -s 2

Make sure you don't forget the -s option. You need to tell it which
scoreset to update or it'll update set 0, which is not what you want
(unless you just did a set 0 run, of course).

Note: the statistics in the new scores file are NOT updated. Just the
scores are.

11. fp-fn-statistics

This script calculates how good the scores are at a given threshold. It
returns the number of false positives, false negatives, true
positives, true negatives and a whole variety of fun statistics.

./fp-fn-statistics -s <scoreset> --cffile <path>

fp-fn-statistics also generates a TCR which is essentially an overall
rating of how good the scores are. (This is only accurate when run on
a different corpus of mail than that with which the scores were
generated). TCR stands for "Total Cost Ratio". The higher the number,
the better the set of scores.


12. Submitting corpora for SpamAssassin

If you want to contribute your mass-check logs to the SpamAssassin
rescoring process, please download the latest revision of SpamAssassin
from the subversion repository. See this page of the wiki:
http://wiki.spamassassin.org/DownloadFromSvn

You will want to read CORPUS_POLICY and CORPUS_SUBMIT. We only do
large rescoring runs just before releases, so be sure to follow the
lists which will have more information and reminders on how to
participate.

Please be sure your corpora are of high quality (everything must be
carefully checked to avoid misfilings). Also, we appreciate varied
sources of mail.


13. Other scripts

Only a subset of the scripts used in rule development and scoring have
been documented here. Most of the others aren't really very
useful. You can examine the others by downloading the source from the
subversion repository: http://wiki.spamassassin.org/DownloadFromSvn.
Everything relating to rule QA and development is in the masses/
sub-directory.

The scripts presented here have had man pages written for them, and an
attempt has been made to standardize the options for ease of use. Many
of the others may require some reading of source to understand how
they work and what they do.


14. Frequently Asked Questions

(Since this is the first version of this document, I'm guessing what
questions would otherwise be asked. So this isn't really a "Frequently
Asked Questions" list, but a "What did Duncan fail to address
elsewhere?" list.)

Q. Why don't the scripts automatically guess which scoreset to use like
they do with --cffile?

A. Firstly, mass-check does not know what scoreset
you are running. It could guess, but it probably shouldn't. Secondly,
the same masses.log can be used for multiple scoresets (a set 1 log
can be used to generate scores for sets 0 and 1, by stripping out net
rules etc.)

Q. How can I determine how good the scoring system is?

A. There is a series of scripts in the source directory (in
masses/tenpass/) designed to determine how accurate the perceptron is
by using "10-fold Cross Validation" (10fcv). Basically, the masses.log
is split into 10 "buckets" and each bucket is sequentially used to
validate against scores generated from the remaining 9.


15. Bugs, author, improvements, etc.

SpamAssassin is written and maintained by a group of developers, whose
names can be found in the CREDITS file.

If you have further questions about SpamAssassin or the rescoring
scripts, try the following:

- Ask on one of the SpamAssassin mailing lists:

http://www.spamassassin.org/lists.html

- If you've found a bug, file a report:

http://bugzilla.spamassassin.org/

- Also, check out our wiki:

http://wiki.spamassassin.org/




#!/bin/sh

SCORESET="0"
if [ "x$1" != "x" ] ; then
    SCORESET=$1
fi

NAME="set$SCORESET"
BASE="logs"

if [ ! -f "ORIG/masses-$NAME.log" ]; then
	echo "Couldn't find logs for $NAME" >&2
	exit 1
fi

if [ "x$2" = "x" ]; then
echo "[Doing a scoreset $SCORESET score-generation run]"

# Clean out old runs
echo "[Cleaning up]"

rm -rf masses-validate.log masses.log $BASE tmp make.output freqs \
    perceptron.scores gen-$NAME.out gen-$NAME.scores gen-$NAME.validate
make clean >/dev/null

# Generate 90/10 split logs
echo "[Generating 90/10 split ham]"
mkdir $BASE
cd $BASE
../tenpass/split-log-into-buckets 10 < ../ORIG/masses-$NAME.log > /dev/null
cat split-[1-9].log > masses.log
rm -f split-[1-9].log
mv split-10.log masses-validate.log

echo "[Generating 90/10 split spam]"
cd ../SPBASE
../tenpass/split-log-into-buckets 10 < ../ORIG/spam-$NAME.log > /dev/null
cat split-[1-9].log > spam.log
rm -f split-[1-9].log
mv split-10.log spam-validate.log
cd ..

echo "[Setting up for gen run]"
# Ok, setup for a run
ln -s $BASE/masses.log .
ln -s $BASE/masses-validate.log .
ln -s NSBASE/nonspam.log ham.log
ln -s SPBASE/spam-validate.log .
ln -s NSBASE/nonspam-validate.log .
ln -s NSBASE/nonspam-validate.log ham-validate.log

# try to find number of processors
numcpus=`cpucount 2>/dev/null || egrep -c '^processor\b' /proc/cpuinfo 2>/dev/null || echo 1`


else

echo "Make sure 50_scores.cf has been replaced appropriately"

# This needs to have 50_scores.cf in place first ...
echo "[gen validation results]"
./fp-fn-statistics --logfile=BASE/masses-validate.log \
	--cffile=../rules --scoreset=$SCORESET | tee gen-$NAME.validate
	--count --cffile=../rules --scoreset=$SCORESET | tee gen-$NAME.validate

echo "[STATISTICS file generation]"
./mk-baseline-results $SCORESET | tee gen-$NAME.statistics




# limitations under the License.
# </@LICENSE>

=head1 NAME

lint-rules-from-freqs - Try to find problems with SpamAssassin rules

=head1 SYNOPSIS

lint-rules-from-freqs [options]

 Options:
    -c,--cffile=path	  Use path as the rules directory
    -s,--scoreset=n	  Use scoreset n
    -l,--logfile=file	  Read in file instead of masses.log
    -f			  Also take into account false positives/negatives

=head1 DESCRIPTION

This script analyzes SpamAssassin tests, based on the hit frequencies
and S/O ratios from a mass-check log (masses.log).  This script can
also optionally take into account the false positive/negative
frequencies.

The script first uses the SpamAssassin rules parser to report on any
illegal syntax. Then it checks the rules match frequencies from the
mass-check log in order to determine how effective the rule is.

=head1 BUGS

Please report bugs to http://bugzilla.spamassassin.org/

=head1 SEE ALSO

L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>

=cut


use FindBin;
use lib "$FindBin::Bin/../lib";
use Mail::SpamAssassin::Masses;
use Mail::SpamAssassin;
use Getopt::Long qw(:config bundling auto_help);
use strict;
use warnings;

# any tests that get less than this % of matches on *both* spam or nonspam, are
# reported.
my $LOW_MATCHES_PERCENT = 0.03;
my $scoreset = 0;

use vars qw($opt_c $opt_l $opt_s $opt_f $opt_p);
  die "
lint-rules-from-freqs: perform 'lint' testing on SpamAssassin rules and scores

GetOptions("c|cffile=s@" => \$opt_c,
	   "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
	   "l|logfile=s" => \$opt_l,
	   "f|falses" => \$opt_f);

This analyzes SpamAssassin tests, based on the hit frequencies and S/O ratios
from a mass-check logfile pair.

$opt_s = 0 unless defined $opt_s;
$opt_l ||= "masses.log";

if (!$opt_c || !scalar(@$opt_c)) {
    # Try to read this in from the log, if possible
    open IN, $opt_l or die "Can't open $opt_l: $!";
    my $files = 0; # are we in the files section?
    while(<IN>) {
	if (!$files) {
	    if (/^\# SVN revision:/) {
		$opt_c = [ "$FindBin::Bin/../rules" ];
		last;
	    } elsif (/^\# Using configuration:$/) {
		$files = 1;
	    }
	} elsif (/^\#\s+(.*)\s*$/) {
	    push (@$opt_c, $1);
	} else {
	    # All done!
	    last;
	}
    }

    if (!defined $opt_c) {
      $opt_c = [ "$FindBin::Bin/../rules" ];
    }

    foreach my $file (@$opt_c) {
	die "Can't read $file" unless -r $file;
    }
  if (/^-f/) { $_ = shift @ARGV; $opt_falsefreqs = $_; }
  elsif (/^-s/) { $_ = shift @ARGV; $scoreset = $_; }
  else { usage(); }
}

print "BAD TESTS REPORT\n";
# First, do a --lint

print "\nRule file syntax issues:\n\n";
lintrules();

# Lint the rule files with the SpamAssassin configuration parser.  The
# configuration text is assembled by hand from the paths in @$opt_c and
# handed to Mail::SpamAssassin as 'config_text'.
{
  local (*STDERR) = \*STDOUT; # Get lint errors on STDOUT

  # Read the config ourselves...

  # Queue init.pre of each directory and every plain file in the order
  # given, then glob each directory for its *.cf files.
  my @files;
  my @dirs;
  foreach my $path (@$opt_c) {
    if (-d $path) {
      push @files, "$path/init.pre" if -r "$path/init.pre";
      push @dirs, $path;
    }
    else {
      push @files, $path;
    }
  }
  foreach my $dir (@dirs) {
    push @files, grep { -r $_ } glob("$dir/*.cf");
  }

  # Concatenate the files, bracketing each one with "file start"/"file end"
  # marker lines.
  my $cf_txt = '';
  foreach my $file (@files) {
    next unless -r $file;

    # Three-argument open: the name is data (it may come from a log
    # header), never a mode.  A file that vanishes between the -r test and
    # the open is reported instead of being read as empty.
    my $fh;
    unless (open($fh, '<', $file)) {
      warn "Can't open $file: $!\n";
      next;
    }
    $cf_txt .= "file start $file\n";
    $cf_txt .= join('', <$fh>);
    $cf_txt .= "\nfile end $file\n";
    close $fh;
  }

  my $spamtest = Mail::SpamAssassin->new({config_text => $cf_txt});

  $spamtest->lint_rules();
}


# Next, check for other stuff
my $masses = Mail::SpamAssassin::Masses->new({rulesdir => $opt_c,
					      scoreset => $opt_s, #,,
					      falses => $opt_f,
					      logfile => $opt_l});

$masses->readlogs();
$masses->do_statistics();

my $rules = $masses->get_rules_array();


my %output;

foreach my $rule (@$rules) {

  my $badrule;
  my ($overall, $spam, $nons, $so, $score, $name) = split (' ');
  next unless ($name =~ /\S/);

  next if ($rule->{tflags} =~ /\bnet\b/ && ($opt_s % 2) == 0);
  next if ($rule->{tflags} =~ /\buserconf\b/);
  my $ffso = $falsefreqs_so{$name};

  if ($rule->{freq_spam} == 0 && $rule->{freq_ham} == 0) {        # sanity!
  next if ($tf =~ /net/ && ($scoreset % 2) == 0);
  next if ($tf =~ /userconf/);

  if ($overall == 0.0 && $spam == 0.0 && $nons == 0.0) {        # sanity!
    $badrule = 'no matches';

  } else {
    if ($rule->{score} < 0.0) {
      # negative score with more spams than nonspams? bad rule.
      if (!$rule->{isnice} && $rule->{soratio} > 0.5 && $rule->{score} < 0.5) {
        $badrule = 'non-nice but -ve score';
      }
      if ($rule->{isnice} && $rule->{soratio} > 0.5 && $rule->{score} < 0.5) {
        if ($opt_f && $rule->{freq_fn} < $rule->{freq_fp}) {
        if ($ffso < 0.5) {
          $badrule = 'fn';
        } else {
          # ignore, the FNs are overridden by other tests so it doesn't
          # affect the overall results.
        }
        # else {
        # ignore, the FNs are overridden by other tests so it doesn't
        # affect the overall results.
        # }
      }

      # low number of matches overall
      if ($rule->{ham_percent} < $LOW_MATCHES_PERCENT)
                 { $badrule ||= ''; $badrule .= ', low matches'; }

    } elsif ($rule->{score} > 0.0) {
      # positive score with more nonspams than spams? bad.
      if ($rule->{isnice} && $rule->{soratio} < 0.5 && $rule->{score} > 0.5) {
        $badrule = 'nice but +ve score';
      }
 
      if (!$rule->{isnice} && $rule->{soratio} < 0.5 && $rule->{score} > 0.5) {
        if ($opt_f && $rule->{freq_fp} > $rule->{freq_fn}) {
          $badrule = 'fp';
        } else {
          # ignore, the FPs are overridden by other tests so it doesn't
          # affect the overall results.
        }
        # else {
        # ignore, the FPs are overridden by other tests so it doesn't
        # affect the overall results.
        # }
      }
 
      # low number of matches overall
      if ($rule->{spam_percent} < $LOW_MATCHES_PERCENT)
                 { $badrule ||= ''; $badrule .= ', low matches'; }
 
    } elsif ($rule->{score} == 0.0) {
      $badrule = 'score is 0';
    }
  }
 
  if (defined $badrule) {
    $badrule =~ s/^, //;
    $output{$badrule} .= $rule->{name} . " ($badrule)\n";
  }
}


exit;


# Build the table key for a rule in a given language: "[lang]_rule" when a
# non-empty language is supplied, otherwise just the bare rule name.
sub concat_rule_lang {
  my ($rule, $lang) = @_;

  return $rule unless defined $lang && $lang ne '';
  return "[$lang]_$rule";
}

# note: do not use parse-rules-for-masses here, we need to do linting instead
# of your average parse
#
# Scan ../rules/[0-9]*.cf and record what each file defines in the
# file-level tables:
#   %rulesfound / @rulesfound   every rule name mentioned (sorted list)
#   %langs / @langsfound        every language named on a "lang" line
#   %rulefile, %descfile, %scorefile, %tflagsfile
#                               file in which the rule / description /
#                               score / tflags line was seen
#   %score, %tflags, %description
#                               the values themselves
# Body-rule regexps are also checked for capturing groups, ".*"/".+" and
# long {n,m} quantifiers; findings are printed as "warning:" lines.
# Takes no arguments; the results are left in the tables above.
sub readrules {
  my @files = <../rules/[0-9]*.cf>;
  %rulesfound = ();
  %langs = ();

  foreach my $file (@files) {
    my $fh;
    unless (open($fh, '<', $file)) {
      # An unreadable file used to be skipped without a word.
      print "warning: cannot open $file: $!\n";
      next;
    }

    while (<$fh>) {
      s/#.*$//g; s/^\s+//; s/\s+$//; next if /^$/;

      # make all the foo-bar stuff foo_bar
      1 while s/^(\S+)-/${1}_/g;
      1 while s/^(lang\s+\S+\s+\S+)-/${1}_/g;

      my $lang = '';
      if (s/^lang\s+(\S+)\s+//) {
        $lang = $1; $langs{$1} = 1;
      }

      if (/^(header|rawbody|body|full|uri|meta)\s+(\S+)\s+/) {
        my ($type, $name) = ($1, $2);

        $rulesfound{$name} = 1;
        $rulefile{$name} ||= $file;
        # NOTE(review): this entry is keyed by the rule *type*, not the rule
        # name; it looks unintentional but is kept as-is -- confirm before
        # changing.
        $scorefile{$type} = $file;
        $score{$name} ||= 1.0;
        $tflags{$name} ||= '';
        $descfile{$name} ||= $file;       # a rule with no score or desc is OK
        $description{$name}->{$lang} = undef;

        if (/^body\s+\S+\s+eval:/) {
          # ignored
        } elsif (/^body\s+\S+\s+(.*)$/) {
          my $re = $1;

          # If there's a ( in a rule where it should be (?:, flag it.
          # but ignore [abc(] ...
          if ($re =~ /[^\\]\([^\?]/ && $re !~ /\[[^\]]*[^\\]\(/) {
            print "warning: non-(?:...) capture in regexp in $file: $_\n";
          }
          if ($re =~ /\.[\*\+]/) {
            print "warning: .* in regexp in $file: $_\n";
          }
          if ($re =~ /[^\\]\{(\d*),?(\d*?)\}/) {
            # Either bound may be empty (e.g. "{3,}"); treat that as 0 so
            # the numeric comparison does not warn.
            my ($min, $max) = ($1 || 0, $2 || 0);
            if ($min > 120 || $max > 120) {
              print "warning: long .{n} in regexp in $file: $_\n";
            }
          }
        }

      } elsif (/^describe\s+(\S+)\s+(.*?)\s*$/) {
        $rulesfound{$1} = 1;
        $descfile{concat_rule_lang ($1, $lang)} ||= $file;
        $descfile{$1} ||= $file;
        $description{$1}->{$lang} = $2;
      } elsif (/^tflags\s+(\S+)\s+(.+)$/) {
        $rulesfound{$1} = 1;
        $tflags{$1} = $2;
        $tflagsfile{concat_rule_lang ($1, $lang)} = $file;
        $tflagsfile{$1} = $file;
      } elsif (/^score\s+(\S+)\s+(.+)$/) {
        $rulesfound{$1} = 1;
        $scorefile{concat_rule_lang ($1, $lang)} = $file;
        $scorefile{$1} = $file;
        $score{$1} = $2;
      } elsif (/^(clear_report_template|clear_spamtrap_template|report|spamtrap|
                clear_terse_report_template|terse_report|
                required_score|ok_locales|ok_languages|test|lang|
                spamphrase|whitelist_from|require_version|
                clear_unsafe_report_template|unsafe_report|
                (?:bayes_)?auto_learn_threshold_nonspam|(?:bayes_)?auto_learn_threshold_spam|
                (?:bayes_)?auto_learn
                )/x) {
        next;
      } else {
        print "warning: unknown rule in $file: $_\n";
      }
    }
    close $fh;
  }
  @langsfound = sort keys %langs;
  @rulesfound = sort keys %rulesfound;
}

# Cross-check the tables filled in by readrules() and print a "warning:"
# line for each inconsistency found:
#   - a description or a score that has no matching rule
#   - a rule name longer than 22 characters
#   - a description longer than 50 characters
#   - a rule with no description (checked for '' and for each language)
# When a description or score is orphaned, existing rules with a similar
# name are listed as possible rename targets.  Rules whose name starts with
# "__" or "T_" are skipped.
sub lintrules {
  my %possible_renames = ();

  # Group rules by a "skeleton" of their name (upper-case letters only, any
  # trailing "_<digits>..." suffix removed) so that near-identical names can
  # be suggested as renames of each other.
  foreach my $rule (@rulesfound) {
    my $match = $rule;
    $match =~ s/_\d+[^_]+$//gs;    # trim e.g. "_20K"
    $match =~ s/[^A-Z]+//gs;    # trim numbers etc.

    # Only rules that really exist are candidates, and each is listed once.
    # The first rule of a skeleton has no entry yet, hence the defined test.
    if (defined ($rulefile{$rule})
        && (!defined $possible_renames{$match}
            || $possible_renames{$match} !~ / \Q$rule\E\b/))
    {
      $possible_renames{$match} .= " ".$rule;
    }
    $possible_rename_matches{$rule} = $match;
  }

  foreach my $lang ('', @langsfound) {
    foreach my $baserule (@rulesfound) {
      next if ( $baserule =~ /^__/ || $baserule =~ /^T_/ );

      my $rule = concat_rule_lang ($baserule, $lang);
      my $f = $descfile{$rule};
      my $warned = '';

      if (defined $f && !defined ($rulefile{$rule})
                && !defined ($rulefile{$baserule}))
      {
        print "warning: $baserule has description, but no rule: $f\n";
        $warned .= ' lamedesc';
      }

      # Does the rule itself exist for this language?
      my $rule_exists = ($lang ne '' && defined ($rulefile{$rule}))
                     || ($lang eq '' && defined ($rulefile{$baserule}));
      # The length warnings name the description file, which may be unset.
      my $where = defined $f ? $f : '';

      # Check our convention for rule name length
      if ($rule_exists && length ($baserule) > 22) {
        print "warning: $baserule has a name longer than 22 chars: $where\n";
      }
      # Check our convention for description length
      if ($rule_exists && defined $description{$baserule}->{$lang}
          && length ($description{$baserule}->{$lang}) > 50)
      {
        print "warning: $baserule has a description longer than 50 chars: $where\n";
      }

      # lang rule trumps normal rule
      $f = $rulefile{$rule} || $rulefile{$baserule};
      # if the rule exists, and the language/rule description doesn't exist ...
      if ( defined $f && !defined $description{$baserule}->{$lang} )
      {
        print "warning: $baserule exists, ",( $lang ne '' ? "lang $lang, " : "" ),"but has no description: $f\n";
        $warned .= ' lamedesc';
      }


      $f = $scorefile{$rule};
      if (defined $f && !defined ($rulefile{$rule})
                && !defined ($rulefile{$baserule}))
      {
        print "warning: $baserule has score, but no rule: $f\n";
        $warned .= ' lamescore';
      }

      # Suggest rename targets only for orphans whose skeleton is shared by
      # at least one existing rule.
      my $skeleton = $possible_rename_matches{$rule};
      if ($warned ne '' && defined $skeleton
          && defined $possible_renames{$skeleton})
      {
        my @matches = split (' ', $possible_renames{$skeleton});
        if (scalar @matches != 0) {
          my $text = '';

          # The language is taken from the description file name
          # ("text_xx.").  It is the same for every candidate, so work it
          # out once.
          my $blang;
          if (defined $descfile{$rule} && $descfile{$rule} =~ /text_(\S\S)\./) {
            $blang = $1;
          }

          # now try and figure out "nearby" rules with no description/score
          foreach my $baser (@matches) {
            my $candidate = concat_rule_lang ($baser, $blang);
            next if ($warned =~ /lamedesc/ && (defined $descfile{$candidate}));
            next if ($warned =~ /lamescore/ && (defined $scorefile{$candidate}));
            $text .= " $baser";
          }

          if ($text ne '') {
            print "warning: (possible renamed rule? $text)\n";
          }
        }
      }
    }
  }
}





LDFLAGS=	-lm

# What rule scoreset are we using?
SCORESET =	3
LOGFILE =	masses.log

#### Should be no need to modify below this line

all: badrules perceptron

perceptron: perceptron.o
	$(CC) -o perceptron perceptron.o $(LDFLAGS) 

perceptron.o: tmp/tests.h
	$(CC) $(CFLAGS) -c -o perceptron.o perceptron.c

tmp/tests.h: tmp/.created logs-to-c
	perl logs-to-c --scoreset=$(SCORESET) --logfile=$(LOGFILE)

# Rebuild the hit-frequency table whenever the mass-check log changes.  The
# prerequisite follows $(LOGFILE) so that overriding LOGFILE keeps the
# dependency and the --logfile argument pointing at the same file.
freqs: $(LOGFILE)
	perl hit-frequencies -x -p -s $(SCORESET) --logfile=$(LOGFILE) > freqs

tmp/scores.h: tmp/tests.h

tmp/ranges.data: tmp/.created freqs score-ranges-from-freqs
	perl score-ranges-from-freqs ../rules $(SCORESET) < freqs

freqs: spam.log ham.log
	perl hit-frequencies -x -p -s $(SCORESET) > freqs

badrules: freqs
	perl lint-rules-from-freqs -s $(SCORESET) --logfile=$(LOGFILE) > badrules

tmp/.created:
	-mkdir tmp




# limitations under the License.
# </@LICENSE>

=head1 NAME
  die <<ENDOFUSAGE;
usage: mass-check [options] target ...
 
  -c=file       set configuration/rules directory
  -p=dir        set user-prefs directory
  -f=file       read list of targets from <file>
  -j=jobs       specify the number of processes to run simultaneously
  --net         turn on network checks!
  --mid         report Message-ID from each message
  --debug       report debugging information
  --progress    show progress updates during check
  --rewrite=OUT save rewritten message to OUT (default is /tmp/out)
  --showdots    print a dot for each scanned message
  --rules=RE    Only test rules matching the given regexp RE
  --restart=N   restart all of the children after processing N messages
  --deencap=RE  Extract SpamAssassin-encapsulated spam mails only if they
                were encapsulated by servers matching the regexp RE
                (default = extract all SpamAssassin-encapsulated mails)
 
  log options
  -o            write all logs to stdout
  --loghits     log the text hit for patterns (useful for debugging)
  --loguris	log the URIs found
  --hamlog=log  use <log> as ham log ('ham.log' is default)
  --spamlog=log use <log> as spam log ('spam.log' is default)
 
  message selection options
  -n            no date sorting or spam/ham interleaving
  --after=N     only test mails received after time_t N (negative values
                are an offset from current time, e.g. -86400 = last day)
                or after date as parsed by Time::ParseDate (e.g. '-6 months')
  --before=N    same as --after, except received times are before time_t N
  --all         don't skip big messages
  --head=N      only check first N ham and N spam (N messages if -n used)
  --tail=N      only check last N ham and N spam (N messages if -n used)
 
  simple target options (implies -o and no ham/spam classification)
  --dir         subsequent targets are directories
  --file        subsequent targets are files in RFC 822 format
  --mbox        subsequent targets are mbox files
  --mbx         subsequent targets are mbx files
 
  Just left over functions we should remove at some point:
  --bayes       report score from Bayesian classifier
 
  non-option arguments are used as target names (mail files and folders),
  the target format is: <class>:<format>:<location>
  <class>       is "spam" or "ham"
  <format>      is "dir", "file", "mbx", or "mbox"
  <location>    is a file or directory name.  globbing of ~ and * is supported

mass-check - Generates SpamAssassin scores and results for large
amounts of mail

=head1 SYNOPSIS

 mass-check [options] class:format:location ...
 mass-check [options] {--dir | --file | --mbox} target ...
 mass-check [options] -f file

  Options:
    -f=file       read list of targets from <file>
    -j=jobs       specify the number of processes to run simultaneously
    --net         turn on network checks!
    --mid         report Message-ID from each message
    --debug       report debugging information
    --progress    show progress updates during check
    --rewrite=OUT save rewritten message to OUT (default is /tmp/out)
    --showdots    print a dot for each scanned message
    --rules=RE    Only test rules matching the given regexp RE
    --restart=N   restart all of the children after processing N messages

    SpamAssassin options
    -c=dir        set configuration/rules directory
    -p=file       set user preferences file (default: none)
    -s=dir        set site rules configuration directory
    -u=dir        set user-state directory
    --dist        assumes the script is being run from the masses/ dir of
                  the unpacked tarball, and makes appropriate guesses for
                  -p and -c
    --deencap=RE  Extract SpamAssassin-encapsulated spam mails only if they
                  were encapsulated by servers matching the regexp RE
                  (default = extract all SpamAssassin-encapsulated mails)

    log options
    -o            write all logs to stdout
    --loghits     log the text hit for patterns (useful for debugging)
    --loguris	  log the URIs found
    --log=file    log to <file> (masses.log is default)

    message selection options
    -n            no date sorting or spam/ham interleaving
    --after=N     only test mails received after time_t N (negative values
                  are an offset from current time, e.g. -86400 = last day)
                  or after date as parsed by Time::ParseDate (e.g. '-6 months')
    --before=N    same as --after, except received times are before time_t N
    --all         don't skip big messages
    --head=N      only check first N ham and N spam (N messages if -n used)
    --tail=N      only check last N ham and N spam (N messages if -n used)

    simple target options (implies -o and no ham/spam classification)
    --dir         subsequent targets are directories
    --file        subsequent targets are files in RFC 822 format
    --mbox        subsequent targets are mbox files
    --mbx         subsequent targets are mbx files

    Just left over functions we should remove at some point:
    --bayes       report score from Bayesian classifier
    --hamlog=log  use <log> as ham log ('ham.log' is default)
    --spamlog=log use <log> as spam log ('spam.log' is default)

=head1 DESCRIPTION

B<mass-check> is designed to assist with rule development and
generation of SpamAssassin scores. It reads in mail from the
location(s) specified on the command line (in the first form above),
given in the form I<class:format:location>, where I<class> is either
"spam" or "ham" (non-spam), I<format> is one of "dir" (Maildirs, MH,
etc), "file", "mbox" (mboxes can be gzipped) or "mbx".

B<mass-check> will analyze each message using SpamAssassin and
generate one line of output per message (by default to masses.log) in
the following format:

 {s|h} {s|h} score filename tests-hit

The first field is the message's class as given on the command line
(ham or spam). The second is the message's class as determined by
SpamAssassin. The third is the message's score, as determined by
SpamAssassin. The fourth field contains the message's filename; for
mboxes, this contains the filename and the byte offset from the
beginning of the file separated by a period. The last field contains a
list of all the tests the message hit separated by commas.

If you want to run this on the currently installed version of
SpamAssassin's rules for sitewide use, make sure your user_prefs file
contains no rules.

=head1 BUGS

Please report bugs to http://bugzilla.spamassassin.org/

=head1 SEE ALSO

L<hit-frequencies(1)>, L<logs-to-c(1)>, L<Mail::SpamAssassin::Masses(3)>,
L<perceptron(1)>

=cut

###########################################################################

use vars qw($opt_c $opt_p $opt_f $opt_j $opt_n $opt_o $opt_all
	    $opt_bayes $opt_before $opt_debug $opt_dist $opt_format
	    $opt_hamlog $opt_head $opt_log $opt_loghits $opt_mid
	    $opt_mh $opt_ms $opt_net $opt_nosort $opt_p $opt_progress
	    $opt_s $opt_showdots $opt_spamlog $opt_tail $opt_rules
	    $opt_restart $opt_loguris $opt_after $opt_rewrite $opt_u
	    $opt_deencap);

use FindBin;
use lib "$FindBin::Bin/../lib";
eval "use bytes";
use Mail::SpamAssassin::ArchiveIterator;
use Mail::SpamAssassin;
use Getopt::Long qw(:config bundling auto_help);
use Pod::Usage;
use POSIX qw(strftime);
use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; };
use strict; # Why wasn't this on?
use Config;

# default settings

$opt_p = "$FindBin::Bin/spamassassin";
$opt_j = 1;
$opt_net = 0;
$opt_log = "masses.log";
$opt_spamlog = "spam.log";

GetOptions("c|cffile=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug",
	   "deencap=s", "dist!", "hamlog=s", "head=i", "log=s",
	   "loghits", "mh", "mid", "ms", "net", "p=s", "progress",
	   "rewrite:s", "s=s", "showdots", "spamlog=s", "tail=i",
	   "rules=s", "restart=i", "u=s", "after=s", "loguris",
	   "dir" => sub { $opt_format = "dir"; },
	   "file" => sub {$opt_format = "file"; },
	   "mbox" => sub { $opt_format = "mbox"; },
	   "mbx" => sub { $opt_format = "mbx"; },
	   '<>' => \&target);

if ($opt_hamlog || $opt_spamlog) { # Old style logging
  $opt_hamlog ||= "ham.log";
  $opt_spamlog ||= "spam.log";
}

my @targets;

# -f <file>: read the list of targets from <file>, one per line.  "-" still
# means standard input, as it did with the old two-argument open; any other
# value is treated purely as a file name (never as a mode or a command).
if ($opt_f) {
  my $list_fh;
  if ($opt_f eq '-') {
    $list_fh = \*STDIN;
  }
  else {
    open($list_fh, '<', $opt_f)
      or die "Can't open target list $opt_f: $!\n";
  }
  while (defined(my $line = <$list_fh>)) {
    chomp $line;
    push(@targets, $line);
  }
  close($list_fh) unless $opt_f eq '-';
}

if (scalar @targets == 0) { pod2usage("No target defined!"); }

# Auto-detect --dist option
if (!defined $opt_dist) {
  if (-f "$FindBin::Bin/../spamassassin.raw") {
    warn "Automatically using --dist. Assuming you are running from the unpacked tarball. Use --no-dist to override.";
    $opt_dist = 1;
  }
}

my $local_rules_dir;
  'debug'              			=> $opt_debug,
  'rules_filename'     			=> $opt_c,
  'userprefs_filename' 			=> "$opt_p/user_prefs",
  'site_rules_filename'			=> "$opt_p/local.cf",
  'userstate_dir'     			=> "$opt_p",
  'save_pattern_hits'  			=> $opt_loghits,
  'dont_copy_prefs'   			=> 1,
  'local_tests_only'   			=> $opt_net ? 0 : 1,
  'only_these_rules'   			=> $opt_rules,
  'ignore_safety_expire_timeout'	=> 1,
  PREFIX				=> '',
  DEF_RULES_DIR        			=> $opt_c,
  LOCAL_RULES_DIR      			=> '',
});

# Fill in the locations that were not given on the command line.
if ($opt_dist) { # Set defaults
  $opt_c ||= "$FindBin::Bin/../rules";
  $opt_p ||= "$FindBin::Bin/mass-check.cf";
  $opt_u ||= "$FindBin::Bin/spamassassin";
  $opt_s ||= "$FindBin::Bin/spamassassin";
  $local_rules_dir = '';
}
else {
  if(!$opt_u) {
    # Assuming this is OK, since mass-check isnt supported on windows, is it?
    # Also, should there be some check to make sure that previous mass-check stuff isn't in there?
    # AFAICT, there isn't otherwise....
    if ( defined $ENV{HOME} && -d "$ENV{HOME}/.spamassassin" ) {
      $opt_u = "$ENV{HOME}/.spamassassin/mass-check";
      if (-d $opt_u) {
        warn "$opt_u already exists -- may contain files that will affect the results";
      }
      elsif (!mkdir($opt_u, 0700)) {
        # Keep going as before, but say why the state dir is missing.
        warn "Can't create $opt_u: $!";
      }
    }
  }

# Leave the rest to SA, we'll get it afterwards

}


# Expand a leading "~" (as in "~/rules") to the user's home directory.
# Only a "~" at the very start of the path that stands alone or is followed
# by "/" is expanded, so names such as "rules/local.cf~" or "~other/x" are
# left untouched instead of being rewritten in the middle of the path.
foreach my $path ($opt_s, $opt_c, $opt_p, $opt_u) {
  next unless $path && defined $ENV{HOME};
  $path =~ s{^~(?=/|$)}{$ENV{HOME}};   # $path aliases the option variable
}


my $spamtest = new Mail::SpamAssassin ({
				       'debug'              			=> $opt_debug,
				       'rules_filename'     			=> $opt_c,
				       'userprefs_filename' 			=> $opt_p,
				       'site_rules_filename'			=> $opt_s,
				       'userstate_dir'     			=> $opt_u,
				       'save_pattern_hits'  			=> $opt_loghits,
				       'dont_copy_prefs'   			=> 1,
				       'local_tests_only'   			=> $opt_net ? 0 : 1,
				       'only_these_rules'   			=> $opt_rules,
				       'ignore_safety_expire_timeout'	=> 1,
				       DEF_RULES_DIR        			=> $opt_c,
				       LOCAL_RULES_DIR      			=> $local_rules_dir,
				      });

$spamtest->compile_now(1);
if ($opt_dist) {
  $spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf");
}

my $who   = `id -un 2>/dev/null`;   chomp $who;
my $where = `uname -n 2>/dev/null`; chomp $where;
my $when  = `date -u`;              chomp $when;
my $revision;

# Work out what to report as the rule-set revision in the log header.
if ($opt_dist) {
  my $rev = "unknown";
  if (open(my $testing_fh, '<', "$opt_c/70_testing.cf")) {
    my $first_line = <$testing_fh>;
    close($testing_fh);
    # The revision is the word following "$Rev:" on the first line.  An
    # empty file, or a first line without that keyword, leaves "unknown"
    # rather than reporting undef or the whole line as the revision.
    if (defined $first_line && $first_line =~ /\$Rev:\s*(\S+)/) {
      $rev = $1;
    }
  }
  $revision = "SVN revision: $rev";
}
else {
  $revision = "Local";
}

my $log_header = "# mass-check results from $who\@$where, on $when\n" .
		 "# M:SA version ".$spamtest->Version()."\n" .
		 "# $revision\n" .
		 "# Perl version: $] on $Config{archname}\n";

# Outside --dist mode, record which configuration locations were used, one
# "# <path>" line each under a "# Using configuration:" heading.
if (!$opt_dist) {
  my @paths = ( $spamtest->{rules_filename}, $spamtest->{site_rules_filename}, $spamtest->{userprefs_filename} );
  $log_header .= "# Using configuration:\n";
  foreach my $file (@paths) {
    # A location that is unset would otherwise be written as an empty
    # "# " entry.
    next unless defined $file && length $file;
    $log_header .=  "# $file\n";
  }
}

my $host = $ENV{'HOSTNAME'} || $ENV{'HOST'} || `hostname` || 'localhost';
chomp $host;


    autoflush STDOUT 1;
    print STDOUT $log_header;
  }
  elsif ($opt_hamlog || $opt_spamlog) {
    open(HAM, "> $opt_hamlog");
    open(SPAM, "> $opt_spamlog");
    autoflush HAM 1;

    print HAM $log_header;
    print SPAM $log_header;
  }
  else {
    open(OUT, "> $opt_log");
    autoflush OUT 1;
    print OUT $log_header;
  }
  $init_results = 1;
}


  # don't open results files until we get here to avoid overwriting files
  &init_results if !$init_results;

  if ($opt_o) {
    print STDOUT $result;
    $spam_count++;
  }
  elsif ($opt_spamlog || $opt_hamlog) {
    if ($class eq "s") {
      print SPAM $result;
    } else {
      print HAM $result;
    }
  }
  else {
    print OUT $result;
  }

  $total_count++;
#warn ">> result: $total_count $class $time\n";

  if ($opt_progress) {
    if ($class eq "s") {
      $spam_count++;
    }
    else {
      $ham_count++;
    }
    progress($time);
  }
}

sub wanted {
  my ($class, $id, $time, $dataref) = @_;
  my $out;

  my $ma = $spamtest->parse($dataref, 1);

    push(@extra, "mid=$mid");
  }

  my $result;
  my $score;
  my $tests;
  my $extra;

  if ($opt_loguris) {
    $result = '.';
    $score = 0;
    $tests = join(" ", sort @uris);
    $extra = '';
  } else {
    if ($status->is_spam()) {
      $result = "s";
    } else {
      $result = "h";
    }
    $score = $status->get_score();
    $tests = join(",", sort(grep(length,$status->get_names_of_tests_hit(),$status->get_names_of_subtests_hit())));
    $extra = join(",", @extra);


  $id =~ s/\s/_/g;

  $out .= sprintf("%s %s %05.2f %s %s %s\n", $class, $result, $score, $id, $tests, $extra);

  if ($tests =~ /MICROSOFT_EXECUTABLE|MIME_SUSPECT_NAME/) {
    $out .= logkilled($ma, $id, "possible virus");

Lines 10-16 Link Here

(-)masses/mk-baseline-results (-2 / +2 lines)
10	echo "Classification success on test corpora, at default threshold:"	10	echo "Classification success on test corpora, at default threshold:"
11	echo	11	echo
12		12
13	./logs-to-c --spam=spam-validate.log --nonspam=nonspam-validate.log --threshold 5 --count --scoreset=$SCORESET \| sed -e 's/^Reading.*//' -e '/^$/d'	13	./fp-fn-statistics --logfile=masses-validate.log --threshold 5 --scoreset=$SCORESET
14		14
15	echo	15	echo
16	echo "Results on test corpora at various alternative thresholds:"	16	echo "Results on test corpora at various alternative thresholds:"
Lines 18-24 Link Here
18		18
19	# list a wide range of thresholds, so that we can make graphs later ;)	19	# list a wide range of thresholds, so that we can make graphs later ;)
20	for thresh in -4 -3 -2 -1 0 1 2 3 4 4.5 5.5 6 6.5 7 8 9 10 12 15 17 20 ; do	20	for thresh in -4 -3 -2 -1 0 1 2 3 4 4.5 5.5 6 6.5 7 8 9 10 12 15 17 20 ; do
21	./logs-to-c --spam=spam-validate.log --nonspam=nonspam-validate.log --threshold $thresh --count --scoreset=$SCORESET \| sed -e 's/^Reading.*//' -e '/^$/d'	21	./fp-fn-statistics --logfile=masses-validate.log --threshold $thresh --scoreset=$SCORESET
22	echo	22	echo
23	done	23	done
24		24





See the CORPUS_POLICY file for more details.



HOW TO SUBMIT RESULTS BACK TO US
--------------------------------


  This script is used to perform "mass checks" of a set of mailboxes, Cyrus
  folders, and/or MH mail spools.  It generates summary lines like this:

  s s 07.22 /home/jm/Mail/Sapm/1382 SUBJ_ALL_CAPS,SUPERLONG_LINE,SUBJ_FULL_OF_8BITS

  or for mailboxes,

  h h 01.32 /path/to/mbox:<5.1.0.14.2.20011004073932.05f4fd28@localhost> TRACKER_ID,BALANCE_FOR_LONG

  listing the path to the message or its message ID, its score, and the tests
  that triggered on that mail.

  get good hits with few false positives, etc., and re-score the tests to
  optimise the ratio.

  If given the --dist option, this script relies on the spamassassin
  distribution directory living in "..". If this script is not in the
  distribution directory, it will generate logs based on the site-wide
  rules, as well as personal rules.


logs-to-c :

  Takes the "masses.log" file and converts it into C source files
  and simplified data files for use by the C score optimization
  algorithm.  (Called by "make" when you build the perceptron, so
  generally you won't need to run it yourself.)


hit-frequencies :

  Analyses the log files and computes how often each test hits, overall,
  for spam mails and for non-spam.


mk-baseline-results :

  Compute results for the baseline scores (read from ../rules/*).  If you

  It will output statistics on the current ruleset to ../rules/STATISTICS.txt,
  suitable for a release build of SpamAssassin.


perceptron.c :

  Perceptron learner by Henry Stern.  See "README.perceptron" for details.




#!/usr/bin/perl -w
#
# <@LICENSE>
# Copyright 2004 Apache Software Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

=head1 NAME

fp-fn-statistics - Display statistics about the quality of scores

=head1 SYNOPSIS

fp-fn-statistics [options]

  Options: 
    -c,--cffile=path	  Use path as the rules directory
    -s,--scoreset=n	  Use scoreset n
    -l,--logfile=file	  Read in file instead of masses.log
    -t,--threshold=n      Use a spam/ham threshold of n (default: 5)
    --lambda=n            Use a lambda value of n

=head1 DESCRIPTION

B<fp-fn-statistics> first calculates the score each message from a
masses.log would have under a new set of scores. It then aggregates
the number of messages correctly and incorrectly found as spam and
ham, and their average scores.

In addition, B<fp-fn-statistics> determines the "Total Cost Ratio" as
a result of the false positives and negatives mentioned above. This
calculation takes into account the value of lambda, which represents
the cost of recovering a false positive, where 1 indicates a message is
tagged only, 9 means the message is mailed back to sender asking for a
token (TMDA style) and 999 means a message is deleted. The default, 5,
represents the message being moved to an infrequently read folder.

=cut

use FindBin;
use lib "$FindBin::Bin/../lib";
use Mail::SpamAssassin::Masses;
use Getopt::Long qw(:config bundling auto_help);
use Pod::Usage;
use strict;
use warnings;

# Command-line options are kept in package globals so that GetOptions can
# fill them in by reference; the rest of the script reads them directly.
use vars qw{$opt_c $opt_l $opt_s $opt_t $opt_lambda};

# -c/--cffile may be given several times (collected into an array ref).
# --lambda takes a numeric argument, as documented in the SYNOPSIS; declared
# without "=f" it would be a plain on/off flag and "--lambda=9" would be
# rejected.
GetOptions("c|cffile=s@" => \$opt_c,
	   "l|logfile=s" => \$opt_l,
	   "s|scoreset=i" => \$opt_s,
           "t|threshold=f" => \$opt_t,
           "lambda=f" => \$opt_lambda);

# Default to the log that mass-check leaves in the current directory.
$opt_l ||= "masses.log";

# No rules directory was given on the command line: try to recover the
# configuration recorded in the header comments of the log itself.
if (!$opt_c || !scalar(@$opt_c)) {
    # Three-argument open with a lexical handle, so that a log name is never
    # interpreted as an open mode or a command.
    open(my $log_fh, '<', $opt_l) or die "Can't open $opt_l: $!";
    my $files = 0; # are we in the files section?
    while (my $line = <$log_fh>) {
        if (!$files) {
            if ($line =~ /^\# SVN revision:/) {
                # This header means the stock rules directory was used.
                $opt_c = [ "$FindBin::Bin/../rules" ];
                last;
            } elsif ($line =~ /^\# Using configuration:$/) {
                $files = 1;
            }
        } elsif ($line =~ /^\#\s+(.*?)\s*$/) {
            # Non-greedy capture: trailing whitespace must not end up in
            # the path, or the readability check below would fail.
            push (@$opt_c, $1);
        } else {
            # First line that is not a "# <path>" entry: all done!
            last;
        }
    }
    close($log_fh);

    # The log had no usable header at all; fall back to the stock rules.
    if (!defined $opt_c) {
      $opt_c = [ "$FindBin::Bin/../rules" ];
    }

    foreach my $file (@$opt_c) {
        die "Can't read $file" unless -r $file;
    }
}

# Remaining defaults: threshold 5 (an explicit 0 is respected), scoreset 0,
# lambda 5.
$opt_t = (defined($opt_t) ? $opt_t : 5);
$opt_s ||= 0;
$opt_lambda ||= 5;

# Base multiplier used when reporting the weighted false-positive score.
my $nybias = 10;

my $masses = Mail::SpamAssassin::Masses->new({
  rulesdir => $opt_c,
  scoreset => $opt_s,
  logfile  => $opt_l,
});

$masses->readlogs();

my $logs = $masses->get_logs();

# Counters are named <actually spam?><scored as spam?>, e.g. $ga_ny counts
# ham that reached the threshold.  The matching *score variables hold the
# summed scores of the messages in each bucket.
my ($ga_yy, $ga_ny, $ga_yn, $ga_nn) = (0, 0, 0, 0);
my ($yyscore, $ynscore, $nyscore, $nnscore) = (0, 0, 0, 0);

my $num_spam = $masses->get_num_spam();
my $num_ham = $masses->get_num_ham();
my $num_logs = $num_spam + $num_ham;

my $count = 0;

my $score;

foreach my $entry (@$logs) {

  # Total the scores of every hit that is not a subrule and has a
  # non-zero score.
  $score = 0;
  foreach my $hit (@{$entry->{tests_hit}}) {
    next if $hit->{issubrule} || !$hit->{score};
    $score += $hit->{score};
  }

  my $flagged = ($score >= $opt_t);

  if ($entry->{isspam}) {
    if ($flagged) {
      $ga_yy++;
      $yyscore += $score;
    } else {
      $ga_yn++;
      $ynscore += $score;
    }
  } else {
    if ($flagged) {
      $ga_ny++;
      $nyscore += $score;
    } else {
      $ga_nn++;
      $nnscore += $score;
    }
  }
}

# Return $part as a percentage of $whole, or 0 when $whole is zero.  A log
# with no ham, no spam, or no message reaching the threshold would otherwise
# abort the report with "Illegal division by zero".
sub _percentage {
  my ($part, $whole) = @_;
  return $whole ? ($part / $whole) * 100.0 : 0;
}

# Scale the false-positive weight by the spam:ham ratio of the corpus.
# Without ham there can be no false positives, so the bias is left alone.
$nybias = $nybias * ($num_spam / $num_ham) if $num_ham;

my $fprate = _percentage($ga_ny, $num_logs);
my $fnrate = _percentage($ga_yn, $num_logs);

printf ("\n# SUMMARY for threshold %3.1f:\n", $opt_t);
printf "# Correctly non-spam: %6d  %4.2f%%  (%4.2f%% of non-spam corpus)\n", $ga_nn,
  _percentage($ga_nn, $num_logs), _percentage($ga_nn, $num_ham);
printf "# Correctly spam:     %6d  %4.2f%%  (%4.2f%% of spam corpus)\n" , $ga_yy,
  _percentage($ga_yy, $num_logs), _percentage($ga_yy, $num_spam);
printf "# False positives:    %6d  %4.2f%%  (%4.2f%% of nonspam, %6.0f weighted)\n", $ga_ny,
  $fprate, _percentage($ga_ny, $num_ham), $nyscore*$nybias;
printf "# False negatives:    %6d  %4.2f%%  (%4.2f%% of spam, %6.0f weighted)\n", $ga_yn,
  $fnrate, _percentage($ga_yn, $num_spam), $ynscore;

# convert to the TCR metrics used in the published lit
# (n<actual><classified as>: e.g. $nlegitspam is ham classified as spam)
my $nspamspam = $ga_yy;
my $nspamlegit = $ga_yn;
my $nlegitspam = $ga_ny;
my $nlegitlegit = $ga_nn;
my $nlegit = $num_ham;
my $nspam = $num_spam;

# Weighted error rate: a false positive costs lambda times as much as a
# false negative.  The baseline is the error rate of flagging nothing.
my $total_cost = $opt_lambda * $nlegit + $nspam;
my $werr = $total_cost
  ? ($opt_lambda * $nlegitspam + $nspamlegit) / $total_cost
  : 0;

my $werr_base = $total_cost ? $nspam / $total_cost : 0;

$werr ||= 0.000001;     # avoid / by 0
my $tcr = $werr_base / $werr;

# Spam recall and precision; precision is reported as 0 when no message
# was classified as spam at all.
my $sr = _percentage($nspamspam, $nspam);
my $sp = _percentage($nspamspam, $nspamspam + $nlegitspam);
printf "# TCR: %3.6f  SpamRecall: %3.3f%%  SpamPrec: %3.3f%%  FP: %3.2f%%  FN: %3.2f%%\n", $tcr, $sr, $sp, $fprate, $fnrate;





use bytes;

use vars qw {
  $opt_h $opt_m
};

use Getopt::Std;
getopts("f:hmH");

use Getopt::Long qw(:config bundling auto_help);
use Pod::Usage;

GetOptions("m|mass-check" => \$opt_m, "h|H|headers" => \$opt_h);
  useful in combination with mass-check logs and mboxes. If the -m
  option is used, the input should be in \"mass-check\" format (as
  output by mass-check). Use the -H option to just output headers.
";
}

=head1 NAME
my $offset = $ARGV[0];

extract-message-from-mbox - Extract a message from an mbox

=head1 SYNOPSIS

 extract-message-from-mbox [--headers] <mbox>.<offset>
 extract-message-from-mbox --mass-check

 Options:
  -h, --headers       Display only message headers
  -m, --mass-check    Read mass-check output from stdin

=head1 DESCRIPTION

B<extract-message-from-mbox> extracts the message from I<mbox>
starting at the byte offset I<offset>. Very useful in combination with
mass-check logs and mboxes. If the -m or --mass-check option is used,
the input should be in "mass-check" format (as output by
mass-check). Use the -H option to just output headers.

=head1 EXAMPLES

To show messages that hit the rule BAYES_99

grep BAYES_99 masses.log | extract-message-from-mbox -m

To show the message indicated by "/path/to/my/mbox.1234"

extract-message-from-mbox /path/to/my/mbox.1234

=cut



if($opt_m) {
  masscheck();
} else {
  foreach my $message (@ARGV) {
    if ($message =~ /^(.*?)(?:\.(\d+))?$/) {
      extract($1, ($2 || 0));
    }
    else {
      pod2usage("Argument must be of the form <mbox>.<offset>");
    }
  }
}

sub extract {

      $found++ if(/^From /);
      last if($found == 3);
      print;
      last if ($opt_h && /^$/) # empty line? end of headers
    }
  }
}

sub masscheck {
  while (<STDIN>) {
    my $mail = (split(/\s+/, $_))[3];
    $mail =~ tr/_/ /;
    if ($mail =~ /^(.*)\.(\d+)$/) {
      extract($1, $2);




# limitations under the License.
# </@LICENSE>

=head1 NAME
use vars qw($opt_cffile $opt_count $opt_lambda $opt_threshold
		$opt_spam $opt_nonspam);

logs-to-c - Convert a mass-check log into perceptron format
my $argcffile = $opt_cffile;

=head1 SYNOPSIS
if ($opt_count) { $justcount = 1; }

logs-to-c [options]
if (defined $opt_threshold) { $threshold = $opt_threshold; }

 Options:
    -c,--cffile=path	  Use path as the rules directory
    -s,--scoreset=n	  Use scoreset n
    -l,--logfile=file	  Read in file instead of masses.log
    -o,--output=dir       Put output in the specified dir (default tmp/)

=head1 DESCRIPTION

B<logs-to-c> will read the mass-check log F<masses.log> or as
specified by the B<--logfile> option, and convert it into the format
needed by the perceptron. This is a format that is simple for the
perceptron to parse, but is not very readable to humans.
# infrequently-read folder".

By default, output will be put in the directory ./tmp/ unless another
directory is specified by the B<--output> option. (Note: at the
current time, this must be ./tmp/ in order for the perceptron to
compile properly.)

=head1 BUGS
my %tests_hit = ();
my %mutable_tests = ();

Please report bugs to http://bugzilla.spamassassin.org/

=head1 SEE ALSO

L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>
my ($num_tests, $num_spam, $num_nonspam);
my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore);

=cut
read_ranges();

use FindBin;
use lib "$FindBin::Bin/../lib";
use Mail::SpamAssassin::Masses;
use Getopt::Long qw(:config bundling auto_help);
use Pod::Usage;
use strict;
use warnings;
exit 0;

use vars qw{$opt_c $opt_l $opt_s $opt_o};

GetOptions("c|cffile=s@" => \$opt_c,
	   "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
	   "l|logfile=s" => \$opt_l,
	   "o|output=s" => \$opt_o);

  if ($justcount) {
    $ga_yy = $ga_ny = $ga_yn = $ga_nn = 0;
    $yyscore = $ynscore = $nyscore = $nnscore = 0.0;
  }

$opt_o ||= "./tmp/";
if (!-d $opt_o) {
  mkdir $opt_o, 0777 or die "Can't mkdir $opt_o";
}

$opt_l ||= "masses.log";
      next if /^\#/;
      next if /^$/;
      if($_ !~ /^.\s+([-\d]+)\s+\S+\s*/) { warn "bad line: $_"; next; }
      my $hits = $1;
#my $foo = $_;
      $_ = $'; s/(?:bayes|time)=\S+//; s/,,+/,/g; s/^\s+//; s/\s+$//;

if (!$opt_c || !scalar(@$opt_c)) {
    # Try to read this in from the log, if possible
    open IN, $opt_l or die "Can't open $opt_l: $!";
    my $files = 0; # are we in the files section?
    while(<IN>) {
	if (!$files) {
	    if (/^\# SVN revision:/) {
		$opt_c = [ "$FindBin::Bin/../rules" ];
		last;
	    } elsif (/^\# Using configuration:$/) {
		$files = 1;
	    }
	} elsif (/^\#\s+(.*)\s*$/) {
	    push (@$opt_c, $1);
	} else {
	    # All done!
	    last;
	}
    }

    if (!defined $opt_c) {
      $opt_c = [ "$FindBin::Bin/../rules" ];
    }

    foreach my $file (@$opt_c) {
	die "Can't read $file" unless -r $file;
        } else {
          push (@tests, $tst);
        }
      }

      if (!$justcount) { 
        $tests_hit{$count} = \@tests;
      }

      if ($file eq $opt_spam) {
	$num_spam++;
        if ($justcount) {
          if ($score >= $threshold) {
            $ga_yy++; $yyscore += $score;
          } else {
            $ga_yn++; $ynscore += $score;
          }
        } else {
          $is_spam{$count} = 1;
        }
      } else {
	$num_nonspam++;
        if ($justcount) {
          if ($score >= $threshold) {
#print "$score -- $foo";
            $ga_ny++; $nyscore += $score;
          } else {
            $ga_nn++; $nnscore += $score;
          }
        } else {
          $is_spam{$count} = 0;
        }
      }
      $count++;
    }
    close IN;
  }
  $num_tests = $count;
}

# ignore rules that are subrules -- we don't generate scores for them...

# Note: this will cause a difference over the old logs-to-c since rank
# is dependent on the frequencies of all rules, not just non-subrules
  print "Reading scores from \"$argcffile\"...\n";
  system ("./parse-rules-for-masses -d \"$argcffile\" -s $opt_scoreset") and die;
  require "./tmp/rules.pl";
  %allrules = %rules;           # ensure it stays global
}

my $greprules = sub { return 0 if $_[1]->{issubrule}; return 1; };

$opt_s ||= 0; # |

my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
					       scoreset => $opt_s, # ,,
					       logfile => $opt_l,
                                               greprules => $greprules });

$masses->readlogs();
$masses->do_score_ranges();

my $rules = $masses->get_rules_array();
my $logs = $masses->get_logs();

my @index_to_rule;
my $num_spam = $masses->get_num_spam();
my $num_ham = $masses->get_num_ham();

# This is misleading -- num_tests is really num_msgs
my $num_tests = $num_spam + $num_ham;


# Write logs and scores as C code
writescores_c();
writetests_c();


sub writescores_c {

  my $size = 0;
  my $mutable = 0;
  my $output = '';
  my $count = 0;
  my $score = 0;

  foreach my $rule (sort {($b->{ismutable} <=> $a->{ismutable}) ||
			  ($a->{name} cmp $b->{name}) } @$rules) {

    $score = $rule->{score};
			  ($mutable_tests{$b} <=> $mutable_tests{$a}) ||
			   ($a cmp $b)} (keys %scores);
  my $max_hits_per_msg = 0;
  for ($file = 0; $file < $num_tests; $file++) {
    my(@hits) =
     grep {(! $ignored_rule{$_}) && $mutable_tests{$_}} (@{$tests_hit{$file}});
    if ((scalar(@hits)+1) > $max_hits_per_msg) {
      $max_hits_per_msg = scalar(@hits)+1;
    }
  }

    # ignored rules (i.e. no scores)
    next unless $score;
    $rule_to_index{$name} = $i;

    # also ignore rules with score range 0
    next if (!$rule->{range_lo} && !$rule->{range_hi});

    # Set an index
    $rule->{index} = $count;
    $index_to_rule[$count] = $rule; # add the reference to the array

    if ($rule->{ismutable}) {
      $mutable++;
      if ($score > $rule->{range_hi}) {
	$score = $rule->{range_hi} - 0.001;
      } elsif ($score < $rule->{range_lo}) {
	$score = $rule->{range_lo} + 0.001;
      }
      #$range_lo{$name} ||= 0.1;
      #$range_hi{$name} ||= 1.5;
    }
    # These should all be set properly if not mutable
    # score = range_lo = range_hi
    else {
      warn "hi != lo for " . $rule->{name} . "!" if $rule->{range_lo} != $rule->{range_hi};
      $score = $rule->{range_hi} = $rule->{range_lo};
    }

    $output .= "." . $count . "\n" .
         "n" . $rule->{name} . "\n" .
	 "b" . $score . "\n" .
	 "m" . $rule->{ismutable} . "\n" .
	 "l" . $rule->{range_lo} . "\n" .
	 "h" . $rule->{range_hi} . "\n";

    $count++;

  }

  # Output this

  open (DAT, ">$opt_o/scores.data");
  print DAT "N$count\n", "M$mutable\n"; # informational
  print DAT $output;
  close DAT;

  open (OUT, ">$opt_o/scores.h");
  print OUT <<EOF;
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
 
int num_scores = $count;
int num_mutable = $mutable;
unsigned char is_mutable[$count];
double range_lo[$count];
double range_hi[$count];
double bestscores[$count];
char *score_names[$count];
double tmp_scores[$count][2];
unsigned char ny_hit[$mutable];
unsigned char yn_hit[$mutable];
 
double lookup[$mutable];
 
/* readscores() is defined in tests.h */
EOF

";
  close OUT;

  writetests_c($max_hits_per_msg); # make sure $rule_to_index is around
}


sub writetests_c {
  my $max_hits_per_msg = $_[0];

  my $max_hits_per_msg = 0;
  my @goodtests;
  my %uniq_logs;
  my $uniq_key;

  my $i = 0;

  # This will "compress" the logs so that one log entry can have a
  # "count" of n indicating it represents n similar messages
    my $uniq_key = $is_spam{$file} . " ";

  foreach my $log (@$logs) {
     grep {length($_) && (! $ignored_rule{$_}) &&
	    (defined($rule_to_index{$_}))} (@{ $tests_hit{$file} });

    (@goodtests) = grep {exists($_->{index})} (@{$log->{tests_hit}});
    @goodtests = sort {$a <=> $b} map {$_->{index}} @goodtests;

    if($max_hits_per_msg < scalar(@goodtests)) {
      $max_hits_per_msg = scalar(@goodtests);
    }

    $uniq_key = $log->{isspam} ? "s" : "";
    $uniq_key .= join(" ", @goodtests);


    # The %count_keys hash's entries will be the log info for each unique log
    # $log->{count} is increased to indicate similar logs

    if (exists($uniq_logs{$uniq_key})) {
      $uniq_logs{$uniq_key}->{count}++;
    } else {
      $uniq_logs{$uniq_key} = $log;
      $uniq_logs{$uniq_key}->{count} = 1;
      $uniq_files{$file} = scalar(keys(%count_keys)) - 1;
    }

  }

  my $num_nondup = scalar(keys %uniq_logs);

  open TOP, ">$opt_o/tests.h";
  print TOP <<EOF;
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

int num_tests = $num_tests;
int num_nondup = $num_nondup;
int num_spam = $num_spam;
int num_nonspam = $num_ham;
int max_hits_per_msg = $max_hits_per_msg;
unsigned char num_tests_hit[$num_nondup];
unsigned char is_spam[$num_nondup];

double scores[$num_nondup];
double tmp_total[$num_nondup];
int tests_count[$num_nondup];
EOF


  print TOP join('', <DATA>);
  print TOP $_;
  close TOP;

  open (DAT, ">tmp/tests.data");

  open (DAT, ">$opt_o/tests.data");
    print DAT ".".$uniq_files{$file}."\n";

  my $out;
  my $base_score;
  my $num_tests_hit;

  $i = 0;
  foreach my $log (values %uniq_logs) {
    $out = '';
    $base_score = $num_tests_hit = 0;

    print DAT "." . $i . "\n";

    $out .= "s" . ( ($log->{isspam})? 1 : 0 ) . "\n";

    foreach my $test (@{$log->{tests_hit}}) {
      if (!$test->{score}) {
	# Don't really know why this happens, but the old logs-to-c
	#did it too

	warn "ignored rule " . $test->{name} . " got a hit!";
	next;
      }

      if (!$test->{range_lo} && !$test->{range_hi}) {
	# We ignored this rule
	next;
      }

      # debugging...
      if (!defined $test->{index}) {
	warn "test with no index";

      if ($num_tests_hit >= $max_hits_per_msg) {
	die "Need to increase \$max_hits_per_msg";
      }
      } else {
	$base_score += $scores{$test};
      }
    }

      if ($test->{ismutable}) {
	$num_tests_hit++;
	$out .= "t".$test->{index}."\n";

	if ($num_tests_hit >= $max_hits_per_msg) {
	  die "\$max_hits_per_msg not big enough!";
	}
}

      }
      else {
	$base_score += $test->{score};
      }

  # read ranges, and mutableness, from ranges.data.
  open (IN, "<tmp/ranges.data")
  	or die "need to run score-ranges-from-freqs first!";

  my $count = 0;
  while (<IN>) {
    /^(\S+) (\S+) (\d+) (\S+)$/ or next;
    my $t = $4;
    $range_lo{$t} = $1+0;
    $range_hi{$t} = $2+0;
    my $mut = $3+0;

    if ($allrules{$t}->{issubrule}) {
      $ignored_rule{$t} = 1;
      $mutable_tests{$t} = 0;
      next;
    }
    if (($range_lo{$t} == $range_hi{$t}) && (! $range_lo{$t})) {
      #warn "ignored rule: score and range == 0: $t\n";
      $ignored_rule{$t} = 1;
      $mutable_tests{$t} = 0;
      next;
    }

    $out .= "b" . $base_score . "\n"; # score to add for non-mutable tests
    $out .= "c" . $log->{count} . "\n"; # number of identical logs
    $count++;

    print DAT "n" . $num_tests_hit . "\n" . $out;
      $mutable_tests{$t} = 0;
    } elsif ($range_lo{$t} == $range_hi{$t}) {
      $mutable_tests{$t} = 0;
    } elsif ($allrules{$t}->{tflags} =~ m/\buserconf\b/i) {
      $mutable_tests{$t} = 0;
    } else {
      $mutable_tests{$t} = 1;
    }
    unless ($mutable_tests{$t} || $scores{$t}) {
      $ignored_rule{$t} = 1;
    }
  }
  close IN;

    $i++;
  foreach my $t (sort keys %allrules) {
    next if (exists($range_lo{$t}));
    if ($allrules{$t}->{issubrule}) {
      $ignored_rule{$t} = 1;
      $mutable_tests{$t} = 0;
      next;
    }
    $ignored_rule{$t} = 0;
    unless (exists($mutable_tests{$t}) &&
	    ($allrules{$t}->{tflags} !~ m/\buserconf\b/i)) {
      $mutable_tests{$t} = 0;
    }
    unless ($mutable_tests{$t} || $scores{$t}) {
      $ignored_rule{$t} = 1;
    }
    $index_to_rule[$count] = $t;
    $count++;
  }
  foreach my $t (keys %range_lo) {
    next if ($ignored_rule{$t});
    if ($mutable_tests{$t}) {
      if (($scores{$t} == 1) && ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
	$scores{$t} = -1;
      } elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/) &&
	       ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
	$scores{$t} = -0.01;
      }
      if ($scores{$t} >= $range_hi{$t}) {
	$scores{$t} = $range_hi{$t} - 0.001;
      } elsif ($scores{$t} <= $range_lo{$t}) {
	$scores{$t} = $range_lo{$t} + 0.001;
      }
    } else {
      if ($allrules{$t}->{tflags} =~ m/\buserconf\b/i) {
	next;
      } elsif ($range_lo{$t} == $range_hi{$t}) {
	$scores{$t} = $range_lo{$t};
	next;
      }
      if (($scores{$t} == 1) && ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
	$scores{$t} = -1;
      } elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/) &&
	       ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
	$scores{$t} = -0.01;
      }
      if ($scores{$t} > $range_hi{$t}) {
	$scores{$t} = $range_hi{$t};
      } elsif ($scores{$t} < $range_lo{$t}) {
	$scores{$t} = $range_lo{$t};
      }
    }
  }
}

  close DAT;
   my $fprate = ($ga_ny / $num_tests) * 100.0;
   my $fnrate = ($ga_yn / $num_tests) * 100.0;

   printf ("\n# SUMMARY for threshold %3.1f:\n", $threshold);
   printf "# Correctly non-spam: %6d  %4.2f%%  (%4.2f%% of non-spam corpus)\n", $ga_nn,
       ($ga_nn /  $num_tests) * 100.0, ($ga_nn /  $num_nonspam) * 100.0;
   printf "# Correctly spam:     %6d  %4.2f%%  (%4.2f%% of spam corpus)\n" , $ga_yy,
       ($ga_yy /  $num_tests) * 100.0, ($ga_yy /  $num_spam) * 100.0;
   printf "# False positives:    %6d  %4.2f%%  (%4.2f%% of nonspam, %6.0f weighted)\n", $ga_ny,
       $fprate, ($ga_ny /  $num_nonspam) * 100.0, $nyscore*$nybias;
   printf "# False negatives:    %6d  %4.2f%%  (%4.2f%% of spam, %6.0f weighted)\n", $ga_yn,
       $fnrate, ($ga_yn /  $num_spam) * 100.0, $ynscore;

  # convert to the TCR metrics used in the published lit
  my $nspamspam = $ga_yy;
  my $nspamlegit = $ga_yn;
  my $nlegitspam = $ga_ny;
  my $nlegitlegit = $ga_yn;
  my $nlegit = $num_nonspam;
  my $nspam = $num_spam;

  my $werr = ($lambda * $nlegitspam + $nspamlegit)
                  / ($lambda * $nlegit + $nspam);

  my $werr_base = $nspam
                  / ($lambda * $nlegit + $nspam);

  $werr ||= 0.000001;     # avoid / by 0
  my $tcr = $werr_base / $werr;

  my $sr = ($nspamspam / $nspam) * 100.0;
  my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;
  printf "# TCR: %3.6f  SpamRecall: %3.3f%%  SpamPrec: %3.3f%%  FP: %3.2f%%  FN: %3.2f%%\n", $tcr, $sr, $sp, $fprate, $fnrate;
}

__DATA__

__DATA__
void loadtests (void) {
  FILE *fin = fopen ("tmp/tests.data", "r");
  char buf[256];


  printf ("Read scores for %d tests.\n", num_scores);
}





my %scores;
my %rulehit;

open(LOGS, "<masses.log");
open(SCORES, "<perceptron.scores");
open(SCORES, "<newscores");

while(<SCORES>)
{


close(SCORES);

while(<LOGS>)
{
    next if /^#/;
    /(.)\s+.\s+[-0-9]*\s+[^\s]+\s+([^\s]*)(\s+?:(?:bayes|time)=\S+)\s*?$/;
    my $class = $1;
    my @rules=split /,/,$2;
    my $score = 0.0;
    foreach $rule (@rules)
    {

	$rulehit{$rule}++;
    }

    if($class eq "s" && $score < 5)
    {
	foreach $rule (@rules)
	{

	}
	$nfn++;
    }
    if($class eq "h" && score >= 5)

close(SPAM);

while(<NONSPAM>)
{
    next if /^#/;
    /.\s+[-0-9]*\s+[^\s]+\s+([^\s]*)\s*$/;
    next unless defined($1);

    my @rules=split /,/,$1;
    my $score = 0.0;
    foreach $rule (@rules)
    {
        next unless (defined ($scores{$rule}));
	$score += $scores{$rule};
	$rulehit{$rule}++;
    }

    if($score >= 5)
    {
	foreach $rule (@rules)
	{
            next unless (defined ($scores{$rule}));

	}
	$nfp++;
    }

}

close(LOGS);

@fpk = sort { $falsepos{$b}/($rulehit{$b}||0.0001) <=> $falsepos{$a}/($rulehit{$a}||0.00001) } keys %falsepos;

print "COMMON FALSE POSITIVES: ($nfp total)\n-----------------------\n\n";




#!/bin/sh -e

# Rewrite the leading result character of every spam.log line: "Y" becomes
# "s s" and "." becomes "s h".  Lines that still start with "#" are dropped.
cat spam.log \
  | perl -n -e 's/^Y/s s/; s/^\./s h/; print unless /^\#/;' \
  > spam.log.sorted

# Same for ham.log, with "h" as the first letter.
cat ham.log \
  | perl -n -e 's/^Y/h s/; s/^\./h h/; print unless /^\#/;' \
  > ham.log.sorted

# sort by time

# Start masses.log with the header line, then append the two logs merged
# numerically on the second "="-separated field.
printf '%s\n' '# SVN revision:' > masses.log

sort -t '=' -n -k2,2 -m spam.log.sorted ham.log.sorted \
  >> masses.log

  + *




#!/usr/bin/perl -w
#
# <@LICENSE>
# Copyright 2004 Apache Software Foundation
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

# (rough) graphic demo of this algorithm:
# 0.0  = -limit [......] 0 ........ limit
# 0.25 = -limit ..[..... 0 .]...... limit
# 0.5  = -limit ....[... 0 ...].... limit
# 0.75 = -limit ......[. 0 .....].. limit
# 1.0  = -limit ........ 0 [......] limit
my $sliding_window_limits = 4.8; # limits = [-$range, +$range]
my $sliding_window_size =   5.5; # scores have this range within limits

# 0.0  = -limit [......] 0 ........ limit
# 0.25 = -limit ....[... 0 ]....... limit
# 0.5  = -limit ......[. 0 .]...... limit (note: tighter)
# 0.75 = -limit .......[ 0 ...].... limit
# 1.0  = -limit ........ 0 [......] limit
my $shrinking_window_lower_base =   0.00; 
my $shrinking_window_lower_range =  1.00; # *ratio, added to above
my $shrinking_window_size_base =    1.00;
my $shrinking_window_size_range =   1.00; # *ratio, added to above

my $use_sliding_window = 0;

# Positional arguments: the rules directory (or the literal '-test'),
# followed by the scoreset number, which defaults to 0.
my ($argcffile, $scoreset) = splice(@ARGV, 0, 2);
$scoreset = 0 unless defined $scoreset;

if (defined $argcffile && $argcffile eq '-test') {
  # use this to debug the ranking -> score-range mapping:
  my $to_range = $use_sliding_window
    ? \&sliding_window_ratio_to_range
    : \&shrinking_window_ratio_to_range;

  foreach my $ratio (0.0, 0.25, 0.5, 0.75, 1.0) {
    my ($lo, $hi) = $to_range->($ratio);
    warn "test: $ratio => [ $lo $hi ]\n";
  }
  exit;
}

# Per-rule data gathered from the frequency input, keyed by rule name.
my %freq_spam = ();
my %freq_nonspam = ();

# Corpus totals, taken from the "(all messages)" line of the input.
my $num_spam;
my $num_nonspam;
my $num_total;

my %mutable_tests = ();
my %ranking = ();
my %soratio = ();
my %is_nice = ();

if (!defined $argcffile) { $argcffile = "../rules"; }

# Regenerate tmp/rules.pl for the chosen rules directory and scoreset.  The
# list form of system() bypasses the shell, so a directory name containing
# quotes, "$" or backticks is passed through untouched.
system ("./parse-rules-for-masses", "-d", $argcffile, "-s", $scoreset) == 0
  or die "parse-rules-for-masses failed (wait status $?)\n";
if (-e "tmp/rules.pl") {
  # Note, the spaces need to stay in front of the require to work around a RPM 4.1 problem
  require "./tmp/rules.pl";
}
else {
  die "parse-rules-for-masses had no error but no tmp/rules.pl!?!";
}

# Read the frequency table from the remaining command-line files (or
# STDIN).  Each usable line has five numeric columns, one ignored column
# and the rule name; anything else is skipped.
while (my $line = <>) {
  my @cols = $line =~ /^\s*([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+\S+\s+(.+)\s*$/
    or next;

  my $test = pop @cols;
  my ($overall, $spam, $nonspam, $soratio, $ranking) = map { $_ + 0 } @cols;

  # The two summary rows carry corpus totals rather than rule data.
  if ($test eq '(all messages)') {
    ($num_spam, $num_nonspam) = ($spam, $nonspam);
    $num_total = $spam + $nonspam;
    next;
  }
  next if $test eq '(all messages as %)';

  unless (defined $rules{$test}) {
    warn "rule $test no longer exists; ignoring\n";
    next;
  }

  $freq{$test} = $overall;
  $freq_spam{$test} = $spam;
  $freq_nonspam{$test} = $nonspam;

  my $tflags = $rules{$test}->{tflags} || '';

  # userconf rules are never mutable, and neither are net rules when the
  # scoreset number is even.
  my $fixed = $tflags =~ /\buserconf\b/
    || ( ($scoreset % 2) == 0 && $tflags =~ /\bnet\b/ );
  $mutable_tests{$test} = $fixed ? 0 : 1;
  $is_nice{$test} = ($tflags =~ m/\bnice\b/i) ? 1 : 0;

  if ($overall >= 0.01) {
    $soratio{$test} = $soratio;
    $ranking{$test} = $ranking;
  }
  else {                        # less than 0.01% of messages were hit
    $mutable_tests{$test} = 0;
    $soratio{$test} = 0.5;
    $ranking{$test} = 0.0;
    $rules{$test}->{score} = 0; # tvd - disable these rules automagically
  }
}

# tmp/ normally exists already at this point (tmp/rules.pl has just been
# loaded from it), so only try to create it, and only complain, when it is
# really missing.  An unconditional mkdir would warn "File exists" on every
# run.
if ( !-d "tmp" && !mkdir("tmp", 0755) ) {
  warn "Couldn't create tmp directory!: $!\n";
}

# Without the output file there is nothing useful left to do, so fail
# loudly instead of printing to an unopened handle.
open (OUT, ">tmp/ranges.data")
  or die "Couldn't open tmp/ranges.data for writing: $!\n";
# Write one "<lo> <hi> <mutable> <rule>" line to OUT for every rule found in
# the frequency input, in descending order of ranking.
foreach my $test (sort { $ranking{$b} <=> $ranking{$a} } keys %freq) {
  if (!defined ($rules{$test})) {
    warn "no rule $test";
    print OUT ("0 0 0 $test\n");
    next;
  }

  my $overall = $freq{$test};
  my $spam = $freq_spam{$test};
  my $nonspam = $freq_nonspam{$test};
  my $soratio = $soratio{$test};
  my $ranking = $ranking{$test};
  my $mutable = $mutable_tests{$test};

  # Immutable or zero-scored rules get a degenerate range (lo == hi ==
  # current score) and a mutable flag of 0.
  if (!$mutable || $rules{$test}->{score} == 0) { # didn't look for score 0 - tvd
    printf OUT ("%3.3f %3.3f 0 $test\n",
                         $rules{$test}->{score},
                         $rules{$test}->{score});
    next;
  }

  # Fold the ranking onto a single [0,1] axis: nice rules land in the lower
  # half, all other rules in the upper half.
  # 0.0 = best nice, 1.0 = best nonnice
  if ($is_nice{$test}) {
    $ranking = .5 - ($ranking / 2);
  } else {
    $ranking = .5 + ($ranking / 2);
  }

  # Initial range from whichever window mapping is enabled.
  my ($lo, $hi);
  if ($use_sliding_window) {
    ($lo, $hi) = sliding_window_ratio_to_range($ranking);
  } else {
    ($lo, $hi) = shrinking_window_ratio_to_range($ranking);
  }

  # tvd
  my $tflags = $rules{$test}->{tflags}; $tflags ||= '';
  if ( $is_nice{$test} && ( $ranking < .5 ) ) { # proper nice rule
    if ( $tflags =~ /\blearn\b/ ) { # learn rules should get a higher score # -5.4
      $lo *=1.8;
    }
    elsif ($soratio <= 0.05 && $nonspam > 0.5) { # let good rules be larger if they want to, -4.5
      $lo *= 1.5;
    }

    # Upper bound for a nice rule: the first matching soratio/nonspam tier
    # sets hi to a fraction of lo; if no tier matches, hi is 0.
    $hi =	($soratio == 0) ? $lo :
    		($soratio <= 0.005 ) ? $lo/1.1 :
    		($soratio <= 0.010 && $nonspam > 0.2) ? $lo/2.0 :
		($soratio <= 0.025 && $nonspam > 1.5) ? $lo/10.0 :
		0;

    if ( $soratio >= 0.35 ) { # auto-disable bad rules
      ($lo,$hi) = (0,0);
    }
  }
  elsif ( !$is_nice{$test} && ( $ranking >= .5 ) ) { # proper spam rule
    if ( $tflags =~ /\blearn\b/ ) { # learn rules should get a higher score
      $hi *=1.8;
    }
    elsif ( $soratio >= 0.99 && $spam > 1.0 ) {
      $hi *= 1.5; # let good rules be larger if they want to
    }

    # Lower bound for a spam rule: the first matching soratio/spam tier
    # sets lo to a fraction of hi; if no tier matches, lo is 0.
    $lo =	($soratio == 1) ? $hi:
    		($soratio >= 0.995 ) ? $hi/4.0 :
    		($soratio >= 0.990 && $spam > 1.0) ? $hi/8.0 :
		($soratio >= 0.900 && $spam > 10.0) ? $hi/24.0 :
		0;

    if ( $soratio <= 0.65 ) { # auto-disable bad rules
      ($lo,$hi) = (0,0);
    }
  }
  else { # rule that has bad nice setting
    ($lo,$hi) = (0,0);
  }
  # A range that has collapsed to a single value leaves nothing to tune.
  $mutable = 0 if ( $hi == $lo );

  printf OUT ("%3.1f %3.1f $mutable $test\n", $lo, $hi);
}
close OUT;
exit;

# Map a ratio to a (lo, hi) score range using a window that slides between
# -$sliding_window_limits and +$sliding_window_limits as the ratio goes
# from 0 to 1.  Returns the pair with the smaller value first.
sub sliding_window_ratio_to_range {
  my ($ratio) = @_;

  my $shift_up   = $sliding_window_size * $ratio;
  my $shift_down = $sliding_window_size * (1-$ratio);

  my $lo = -$sliding_window_limits + $shift_up;
  my $hi = +$sliding_window_limits - $shift_down;

  # Keep the pair ordered even when the two edges cross over.
  ($lo, $hi) = ($hi, $lo) if $lo > $hi;

  ($lo, $hi);
}

# Map a ratio in [0,1] to a (lo, hi) score range.  The distance of the ratio
# from 0.5 sets both where the window starts and how wide it is; ratios
# below 0.5 produce the mirrored, negative range.
sub shrinking_window_ratio_to_range {
  my ($ratio) = @_;

  my $signed = ($ratio -.5) * 2;        # adj [0,1] to [-1,1]
  my $mirrored = ($signed < 0) ? 1 : 0;
  my $strength = $mirrored ? -$signed : $signed;

#$strength /= 1.5 if ( $ratio < 0.95 && $ratio > 0.15 ); # tvd

  my $start = $shrinking_window_lower_base
                        + ($shrinking_window_lower_range * $strength);
  my $width = $shrinking_window_size_base
                        + ($shrinking_window_size_range * $strength);

  my ($lo, $hi) = ($start, $start + $width);

  # Reflect the range around zero for the lower half of the ratio axis.
  ($lo, $hi) = (-$hi, -$lo) if $mirrored;

  if ($lo > $hi) { # ???
    ($lo, $hi) = ($hi, $lo);
  }

  ($lo, $hi);
}





# limitations under the License.
# </@LICENSE>

use Getopt::Std;
getopts("l:L:h");

use FindBin;
use lib "$FindBin::Bin/../lib";
use Mail::SpamAssassin::Masses;
use Getopt::Long qw(:config bundling auto_help);
use Pod::Usage;
use strict;
use warnings;

use vars qw {
$opt_c $opt_s $opt_l $opt_L $opt_inclang
};

GetOptions("c|cffile=s@" => \$opt_c,
           "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
           "l|logfile=s" => \$opt_l,
           "L|language=s" => \$opt_L,
           "include-language=s" => \$opt_inclang);

    -l LC  also print language specific rules for lang code LC (or 'all')
    -L LC  only print language specific rules for lang code LC (or 'all')

    options -l and -L are mutually exclusive.

my $lower = 1;
#$threshold = 5;
my $higher = 9;
my $min_expected = 2; # Should not be set to more than 5 or less than 2


=head1 NAME
 
find-extremes - Determine which rules are most likely to cause false positives/negatives.
 
=head1 SYNOPSIS
 
find-extremes [options]
 
 Options:
    -c,--cffile=path      Use path as the rules directory
    -s,--scoreset=n       Use scoreset n
    -l,--logfile=file     Read in file instead of masses.log
    -L,--language=lc      Only print language specific tests for specified lang code (try 'all')
    --include-language=lc Also print language specific tests for specified lang code (try 'all')
 
=head1 DESCRIPTION

B<find-extremes> will read the mass-check log F<masses.log> or the
log given by the B<--logfile> option. By default, B<find-extremes>
will assume the proper values for B<--cffile> based on the header of
the masses.log. The output will include the following columns:

=over 4

=item RULE

=item CHISQUARE

=item RATIO_FALSEPOS

=item OVER_FALSEPOS

=item FREQ_OVER

=back

=head1 BUGS

This script may or may not work as designed - it probably needs some
tweaking, and I probably introduced a bug into it while re-writing for
the new Masses stuff. 

=head1 NOTES

This script is poorly documented. Patches welcome.

=cut


$opt_s = 0 unless defined $opt_s;

my $ok_lang = lc ( $opt_inclang || $opt_L || '');
$ok_lang = '.' if ($ok_lang eq 'all');

# Rule filter handed to Mail::SpamAssassin::Masses as its "greprules"
# callback.  Receives the rule name and the rule's info hash; returns 1 to
# keep the rule and 0 to drop it.
#
# A rule is dropped when:
#   - it is language-specific and its language does not match $ok_lang
#     (or no language was requested at all);
#   - it is not language-specific but -L (language-only mode) was given;
#   - its tflags contain the word "net".
my $greprules = sub {
  my ($name, $rule) = @_;

  my $rule_lang = $rule->{lang};

  if ($rule_lang) {
    # Wrong language
    return 0 unless ($ok_lang && $rule_lang =~ /^$ok_lang/i);
  }
  elsif ($opt_L) {
    return 0;
  }

  return ($rule->{tflags} =~ /\bnet\b/) ? 0 : 1;
};

$opt_l ||= "masses.log";

# No rules directories were given with -c/--cffile: try to work them out
# from the header comments of the mass-check log.
#
#   - A "# SVN revision:" line seen before any "# Using configuration:" line
#     selects the rules directory that sits next to this script.
#   - Otherwise every "# <path>" line following "# Using configuration:" is
#     taken as a rules path, up to the first line that is not such a comment.
#
# Dies if the log cannot be opened or if any resulting path is unreadable.
if (!$opt_c || !scalar(@$opt_c)) {
    # Try to read this in from the log, if possible.  Three-argument open
    # with a lexical handle, so a log name supplied on the command line can
    # never be interpreted as an open mode or a pipe.
    open (my $log_fh, '<', $opt_l) or die "Can't open $opt_l: $!";
    my $files = 0; # are we in the files section?
    # Read into a lexical rather than the global $_ so callers' $_ survives.
    while (defined(my $line = <$log_fh>)) {
        if (!$files) {
            if ($line =~ /^\# SVN revision:/) {
                $opt_c = [ "$FindBin::Bin/../rules" ];
                last;
            } elsif ($line =~ /^\# Using configuration:$/) {
                $files = 1;
            }
        } elsif ($line =~ /^\#\s+(.*)\s*$/) {
            push (@$opt_c, $1);
        } else {
            # All done!
            last;
        }
    }
    close ($log_fh);

    foreach my $file (@$opt_c) {
        die "Can't read $file" unless -r $file;
    }
}

my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
                                               scoreset => $opt_s,
                                               greprules => $greprules,
                                               logfile => $opt_l,
                                               nologs => 1});

$masses->readrules();
$masses->readlogs();
$higher = 9;
$min_expected = 2; # Should not be set to more than 5 or less than 2

my $rules = $masses->get_rules_hash();
my $logs = $masses->get_logs();

my $num_spam = $masses->get_num_spam();
my $num_ham = $masses->get_num_ham();

my %freq_over_higher_falsepos = (); # how often non-nice found in ones over
                                    # higher threshold that are false positives
my %freq_nonspam = ();	# how often nice found in nonspam
my %freq_under_lower_falseneg = (); # how often nice found in ones under
                                    # lower threshold that are false negatives


my %ratio_expected_falsepos = (); # ratio version of above
my %ratio_expected_falseneg = (); # ditto

my $num_spam = 0;
my $num_nonspam = 0;
my $num_over_higher_falsepos = 0;
my $num_under_lower_falseneg = 0;
my $ok_lang = '';

my %chisquare = ( );
my %prob = ( );

$ok_lang = lc ($opt_l || $opt_L || '');
if ($ok_lang eq 'all') { $ok_lang = '.'; }

foreach my $key (keys %$rules) {

  if ($rules->{$key}->{tflags} !~ /\buserconf\b/) {
    if ($rules->{$key}->{tflags} =~ m/nice/) {
         (!$ok_lang || $rules{$key}->{lang} !~ /^$ok_lang/i)
     ) ) {
    delete $rules{$key} ; next;
  }

  if ($rules{$key}->{tflags} =~ m/net/) {
    delete $rules{$key};
    next;
  }
  if ($rules{$key}->{tflags} !~ m/userconf/) {
    if ($rules{$key}->{tflags} =~ m/nice/) {
      $freq_nonspam{$key} = 0;
      $freq_under_lower_falseneg{$key} = 0;
    } else {
      $freq_spam{$key} = 0;
      $freq_over_higher_falsepos{$key} = 0;
    }
  }

}

# Walk every parsed log entry and count the "extreme" misclassifications:
#
#   - spam whose total score is at or below $lower even though its
#     positive-scoring hits alone reach $lower (extreme false negatives);
#   - ham whose total score is above $higher (extreme false positives).
#
# $num_under_lower_falseneg / $num_over_higher_falsepos count MESSAGES, so
# they are bumped exactly once per qualifying message; the per-rule tallies
# in %freq_under_lower_falseneg / %freq_over_higher_falsepos are bumped once
# per hit of a rule that was pre-registered in the corresponding hash.
foreach my $log (@$logs) {

  if ($log->{isspam}) {
    # Also need to count plus_hits (sum of the positive rule scores only)
    my $plus_hits = 0;
    foreach my $test (@{$log->{tests_hit}}) {
      $plus_hits += $test->{score} if ($test->{score} > 0);
    }

    if (($log->{score} <= $lower) && $plus_hits && $plus_hits >= $lower) {
      $num_under_lower_falseneg++;
      foreach my $test (@{$log->{tests_hit}}) {
        $freq_under_lower_falseneg{$test->{name}}++
          if exists $freq_under_lower_falseneg{$test->{name}};
      }
    }
  }
  else {
    if ($log->{score} > $higher) {
      $num_over_higher_falsepos++;
      foreach my $test (@{$log->{tests_hit}}) {
        $freq_over_higher_falsepos{$test->{name}}++
          if exists $freq_over_higher_falsepos{$test->{name}};
      }
    }
  }

}

unless (($num_over_higher_falsepos >= $min_expected)
	&& ($num_under_lower_falseneg >= $min_expected)) {
  die "Insufficient extremes in dataset (" . $num_over_higher_falsepos .

}

my $ratio_falsepos = $num_over_higher_falsepos/$num_spam;
my $ratio_falseneg = $num_under_lower_falseneg/$num_ham;

my $skipped_non_nice = 0;

# non-nice rules
foreach my $rule (keys %freq_over_higher_falsepos) {
  my $expected = $rules->{$rule}->{freq_spam}*$ratio_falsepos;
  if ($expected <= $min_expected) {
    $skipped_non_nice++;
    next;

   $freq_over_higher_falsepos{$rule}/$expected;
  ($chisquare{$rule},$prob{$rule}) =
   chisquare($num_spam,$num_over_higher_falsepos,
	     $rules->{$rule}->{freq_spam},$freq_over_higher_falsepos{$rule});
  if ($freq_over_higher_falsepos{$rule} < $expected) {
    $chisquare{$rule} *= -1;
  }


my $skipped_nice = 0;

# nice rules
foreach my $rule (keys %freq_under_lower_falseneg) {
  my $expected = $rules->{$rule}->{freq_ham}*$ratio_falseneg;
  if ($expected <= $min_expected) {
    $skipped_nice++;
    next;

  $ratio_expected_falseneg{$rule} =
   $freq_under_lower_falseneg{$rule}/$expected;
  ($chisquare{$rule},$prob{$rule}) =
   chisquare($num_ham,$num_under_lower_falseneg,
	     $rules->{$rule}->{freq_ham},$freq_under_lower_falseneg{$rule});
  if ($freq_under_lower_falseneg{$rule} < $expected) {
    $chisquare{$rule} *= -1;
  }


warn "Skipped nice: $skipped_nice\n";

# The rest is copied verbatim from before - it's complicated and not
# commented and should work unchanged except for the freq_spam and
# freq_ham stuff and fixing some use strict stuff

my @rules_falsepos = grep {$prob{$_} < .5} (keys %over_expected_falsepos);

if (scalar(@rules_falsepos)) {
  print "RULE\t\tCHISQUARE\tRATIO_FALSEPOS\tOVER_FALSEPOS\tFREQ_OVER ($num_over_higher_falsepos)\n";
  my(@rules_falsepos_bad) =

	   $over_expected_falsepos{$a}) ||
	    ($freq_over_higher_falsepos{$b} <=>
	     $freq_over_higher_falsepos{$a})} (@rules_falsepos_bad);
    foreach my $rule (@rules_falsepos_bad) {
      print $rule . "\t" . $prob{$rule} . "\t" .
       $ratio_expected_falsepos{$rule} . "\t" .
	$over_expected_falsepos{$rule} . "\t" .

       ($chisquare{$a} <=> $chisquare{$b}) ||
	($ratio_expected_falsepos{$a} <=>
	 $ratio_expected_falsepos{$b}) ||
	  ($rules->{$b}->{freq_spam} <=>
	   $rules->{$a}->{freq_spam})} (@rules_falsepos_good);
    foreach my $rule (@rules_falsepos_good) {
      print $rule . "\t" . $prob{$rule} . "\t" .
       $ratio_expected_falsepos{$rule} . "\t" .
	$over_expected_falsepos{$rule} . "\t" .

  warn "No over-falsepos to print\n";
}

my @rules_falseneg = grep {$prob{$_} < .5} (keys %over_expected_falseneg);

if (scalar(@rules_falseneg)) {
  print "RULE\t\tCHISQUARE\tRATIO_FALSENEG\tOVER_FALSENEG\tFREQ_UNDER ($num_under_lower_falseneg)\n";

	   $over_expected_falseneg{$a}) ||
	    ($freq_under_lower_falseneg{$b} <=>
	     $freq_under_lower_falseneg{$a})} (@rules_falseneg_bad);
    foreach my $rule (@rules_falseneg_bad) {
      print $rule . "\t" . $prob{$rule} . "\t" .
       $ratio_expected_falseneg{$rule} . "\t" .
	$over_expected_falseneg{$rule} . "\t" .

       ($chisquare{$a} <=> $chisquare{$b}) ||
	($ratio_expected_falseneg{$a} <=>
	 $ratio_expected_falseneg{$b}) ||
	  ($rules->{$b}->{freq_ham} <=>
	   $rules->{$a}->{freq_ham})} (@rules_falseneg_good);
    foreach my $rule (@rules_falseneg_good) {
      print $rule . "\t" . $prob{$rule} . "\t" .
       $ratio_expected_falseneg{$rule} . "\t" .
	$over_expected_falseneg{$rule} . "\t" .

}

exit;

# Tally rule hit frequencies from separate ham and spam mass-check logs.
#
# The spam log is $ARGV[0] (default "spam.log"); the ham log is $ARGV[1]
# (default "good.log" if that file exists, otherwise "nonspam.log").
#
# Each non-comment line is expected to look like
#   <char> <integer> <token> <comma-separated test names>
# Only tests present in %rules are considered, and a message is counted only
# if at least one such test hit.  For every counted message this updates the
# file-level totals and per-rule tallies:
#   ham:  $num_nonspam, %freq_nonspam, and -- when the summed rule scores
#         reach $higher -- $num_over_higher_falsepos and
#         %freq_over_higher_falsepos;
#   spam: $num_spam, %freq_spam, and -- when the summed scores are at or
#         below $lower while the positive scores alone reach $lower --
#         $num_under_lower_falseneg and %freq_under_lower_falseneg.
# Per-rule tallies are only incremented for keys that already exist.
#
# Lines that cannot be parsed produce a warning and are skipped.
# Dies if either log cannot be opened.
sub readlogs {
  my $spam = $ARGV[0] || "spam.log";
  my $nonspam = $ARGV[1] || (-f "good.log" ? "good.log" : "nonspam.log");

  # Three-argument open with lexical handles: the names come from @ARGV, so
  # they must never be parsed for an open mode or a pipe character.
  open(my $nonspam_fh, '<', $nonspam)
    or die "Couldn't open file '$nonspam': $!; stopped";

  while (defined(my $line = <$nonspam_fh>)) {
    if ($line =~ m/^\s*\#/) {
      next;
    } elsif ($line =~ m/^.\s+-?\d+\s+\S+\s*(\S*)/) {
      my $tests = $1;
      my $hits = 0;
      my(@tests) = ();
      foreach my $test (grep {length($_)} (split(/,+/,$tests))) {
        if (exists($rules{$test})) {
          push @tests, $test;
          $hits += $rules{$test}->{score};
        }
      }

      if (scalar(@tests)) {
        $num_nonspam++;
        foreach my $test (grep {exists($freq_nonspam{$_})} (@tests)) {
          $freq_nonspam{$test}++;
        }
        if ($hits >= $higher) {
          $num_over_higher_falsepos++;
          foreach my $test (grep
                            {exists($freq_over_higher_falsepos{$_})} (@tests)) {
            $freq_over_higher_falsepos{$test}++;
          }
        }
      }
    } elsif ($line =~ m/\S/) {
      chomp($line);
      warn "Can't interpret line '$line'; skipping";
    }
  }

  close($nonspam_fh);

  open(my $spam_fh, '<', $spam)
    or die "Couldn't open file '$spam': $!; stopped";

  while (defined(my $line = <$spam_fh>)) {
    if ($line =~ m/^\s*\#/) {
      next;
    } elsif ($line =~ m/^.\s+-?\d+\s+\S+\s*(\S*)/) {
      my $tests = $1;
      my $hits = 0;
      my $plus_hits = 0;   # sum of the positive rule scores only
      my(@tests) = ();
      foreach my $test (grep {length($_)} (split(/,+/,$tests))) {
        if (exists($rules{$test})) {
          push @tests, $test;
          $hits += $rules{$test}->{score};
          if ($rules{$test}->{score} > 0) {
            $plus_hits += $rules{$test}->{score};
          }
        }
      }

      if (scalar(@tests)) {
        $num_spam++;
        foreach my $test (grep {exists($freq_spam{$_})} (@tests)) {
          $freq_spam{$test}++;
        }
        if (($hits <= $lower) && $plus_hits &&
            ($plus_hits >= $lower)) {
          $num_under_lower_falseneg++;
          foreach my $test (grep
                            {exists($freq_under_lower_falseneg{$_})} (@tests)) {
            $freq_under_lower_falseneg{$test}++;
          }
        }
      }
    } elsif ($line =~ m/\S/) {
      chomp($line);
      warn "Can't interpret line '$line'; skipping";
    }
  }

  close($spam_fh);
}


# Regenerate the rules data by running ./parse-rules-for-masses, then load
# the ./tmp/rules.pl file (presumably written by that script -- confirm).
# Dies, reporting $?, if the helper exits with a non-zero status.
sub readscores {
  my $status = system("./parse-rules-for-masses");
  if ($status) {
    die "Couldn't do parse-rules-for-masses: $?; stopped";
  }
  require "./tmp/rules.pl";
}





do
  mkdir tmp/10passrules > /dev/null 2>&1
  cp ../rules/[0-9]*.cf tmp/10passrules
  ./rewrite-cf-with-new-scores -s $SCORESET --old=../rules/50_scores.cf \
	--new=tenpass_results/scores.$run --out=tmp/10passrules/50_scores.cf \
        --cffile=../rules

  ./fp-fn-statistics --cffile=tmp/10passrules \
	--logfile=tenpass_results/masses.log.$run > tmp/stats
	--nonspam=tenpass_results/ham.log.$run > tmp/stats

  grep TCR: tmp/stats
done




#!/bin/sh

# change these!
BASE=logs/
SPBASE=spam-logs
SCORESET="0"

passes="1 2 3 4 5 6 7 8 9 10"
mkdir tenpass_results
mkdir -p ORIG

> make.output


  echo "Training for corpus $id..."
  pwd; date

  > masses.log
  > ORIG/spam-set$SCORESET.log

  echo -n "(using corpora blocks: "
  for notid in $passes ; do
    if [ "$notid" != "$id" ] ; then
      echo -n "$notid "
      cat $BASE/split-$notid.log >> masses.log
      cat $SPBASE/split-$notid.log >> ORIG/spam-set$SCORESET.log
    fi
  done
  echo "for training)"

  make clean >> make.output
  make perceptron 2>&1 >> make.output
  ./perceptron
  pwd; date
  date

  echo "Saving test data for corpus $id..."

  cp $BASE/split-$id.log tenpass_results/masses.log.$id

  cp perceptron.scores tenpass_results/scores.$id

done



Return to bug 2853

Lines 1263-1273 Link Here

(-)lib/Mail/SpamAssassin.pm (-3 / +5 lines)
1263		1263
1264	# read a file called "init.pre" in site rules dir before all others;	1264	# read a file called "init.pre" in site rules dir before all others;
1265	# even the system config.	1265	# even the system config.
		1266
		1267	# Save this in $self so that it can be accessed externally (for logging, etc.)
		1268	$self->{site_rules_filename} \|\|= $self->first_existing_path (@site_rules_path);
1266	my $siterules = $self->{site_rules_filename};	1269	my $siterules = $self->{site_rules_filename};
1267	$siterules \|\|= $self->first_existing_path (@site_rules_path);
1268		1270
		1271	$self->{rules_filename} \|\|= $self->first_existing_path (@default_rules_path);
1269	my $sysrules = $self->{rules_filename};	1272	my $sysrules = $self->{rules_filename};
1270	$sysrules \|\|= $self->first_existing_path (@default_rules_path);
1271		1273
1272	if ($siterules) {	1274	if ($siterules) {
1273	$fname = File::Spec->catfile ($siterules, "init.pre");	1275	$fname = File::Spec->catfile ($siterules, "init.pre");
Lines 1300-1307 Link Here
1300	$self->get_and_create_userstate_dir();	1302	$self->get_and_create_userstate_dir();
1301		1303
1302	# user prefs file	1304	# user prefs file
		1305	$self->{userprefs_filename} \|\|= $self->first_existing_path (@default_userprefs_path);
1303	$fname = $self->{userprefs_filename};	1306	$fname = $self->{userprefs_filename};
1304	$fname \|\|= $self->first_existing_path (@default_userprefs_path);
1305		1307
1306	if (defined $fname) {	1308	if (defined $fname) {
1307	if (!-f $fname && !$self->{dont_copy_prefs} && !$self->create_default_prefs($fname)) {	1309	if (!-f $fname && !$self->{dont_copy_prefs} && !$self->create_default_prefs($fname)) {

Lines 92-98 Link Here

(-)masses/rule-qa/corpus-hourly (-91 / +54 lines)
92	@files = sort readdir(CORPUS);	92	@files = sort readdir(CORPUS);
93	closedir(CORPUS);	93	closedir(CORPUS);
94		94
95	@files = grep { /^(?:spam\|ham)-(?:net-)?\w+\.log$/ && -f "$opt{corpus}/$_" && -M _ < 10 } @files;	95	@files = grep { /^masses-(?:net-)?\w+\.log$/ && -f "$opt{corpus}/$_" && -M _ < 10 } @files;
96	@files = grep {	96	@files = grep {
97	my $time = 0;	97	my $time = 0;
98	my $tag = 0;	98	my $tag = 0;
Lines 109-114 Link Here
109	}	109	}
110	$time;	110	$time;
111	} @files;	111	} @files;
		112
112	}	113	}
113		114
114	sub rename {	115	sub rename {
Lines 158-288 Link Here
158		159
159	next if ($class eq "NET" && $age !~ /^(?:new\|all\|age\|7day)$/);	160	next if ($class eq "NET" && $age !~ /^(?:new\|all\|age\|7day)$/);
160		161
161	my @ham = grep { /^ham/ } @files;	162	print STDERR "logs: " . join(' ', @files) . "\n";
162	my @spam = grep { /^spam/ } @files;
163		163
164	print STDERR "ham: " . join(' ', @ham) . "\n";
165	print STDERR "spam: " . join(' ', @spam) . "\n";
166
167	chdir $opt{corpus};	164	chdir $opt{corpus};
168		165
169	# net vs. local	166	# net vs. local
170	if ($class eq "NET") {	167	if ($class eq "NET") {
171	@ham = grep { /-net-/ } @ham;	168	@files = grep { /-net-/ } @files;
172	@spam = grep { /-net-/ } @spam;	169	print STDERR "logs: " . join(' ', @files) . "\n";
173	print STDERR "ham: " . join(' ', @ham) . "\n";
174	print STDERR "spam: " . join(' ', @spam) . "\n";
175	}	170	}
176	else {	171	else {
177	# if both net and local exist, use newer	172	# if both net and local exist, use newer
178	my %spam;
179	my %ham;
180		173
181	for my $file (@spam) {	174	for my $file (@files) {
182	$spam{$1}++ if ($file =~ m/-(\w+)\.log$/);	175	$logs{$1}++ if ($file =~ m/-(\w+)\.log$/);
183	}	176	}
184	for my $file (@ham) {	177	while (my ($user, $count) = each %logs) {
185	$ham{$1}++ if ($file =~ m/-(\w+)\.log$/);
186	}
187	while (my ($user, $count) = each %ham) {
188	if ($count > 1) {	178	if ($count > 1) {
189	my $nightly = "ham-$user.log";	179	my $nightly = "masses-$user.log";
190	my $weekly = "ham-net-$user.log";	180	my $weekly = "masses-net-$user.log";
191	if ($revision{$nightly} >= $revision{$weekly}) {	181	if ($revision{$nightly} >= $revision{$weekly}) {
192	@ham = grep { $_ ne $weekly } @ham;	182	@files = grep { $_ ne $weekly } @files;
193	}	183	}
194	else {	184	else {
195	@ham = grep { $_ ne $nightly } @ham;	185	@files = grep { $_ ne $nightly } @files;
196	}	186	}
197	}	187	}
198	}	188	}
199	while (my ($user, $count) = each %spam) {	189	print STDERR "logs: " . join(' ', @files) . "\n";
200	if ($count > 1) {
201	my $nightly = "spam-$user.log";
202	my $weekly = "spam-net-$user.log";
203	if ($revision{$nightly} >= $revision{$weekly}) {
204	@spam = grep { $_ ne $weekly } @spam;
205	}
206	else {
207	@spam = grep { $_ ne $nightly } @spam;
208	}
209	}
210	}
211	print STDERR "ham: " . join(' ', @ham) . "\n";
212	print STDERR "spam: " . join(' ', @spam) . "\n";
213	}	190	}
214		191
215	# age	192	# age
216	if ($class eq "NET" && $age ne "7day") {	193	if ($class eq "NET" && $age ne "7day") {
217	@ham = grep { -M "$_" < 10 } @ham;	194	@files = grep { -M "$_" < 10 } @files;
218	@spam = grep { -M "$_" < 10 } @spam;
219	# find most recent CVS revision	195	# find most recent CVS revision
220	my $wanted = 0.0;	196	my $wanted = 0.0;
221	for (@spam, @ham) {	197	for (@spam, @ham) {
222	$wanted = $revision{$_} if ($revision{$_} > $wanted);	198	$wanted = $revision{$_} if ($revision{$_} > $wanted);
223	}	199	}
224	@spam = grep { $revision{$_} eq $wanted } @spam;	200	@files = grep { $revision{$_} eq $wanted } @files;
225	@ham = grep { $revision{$_} eq $wanted } @ham;	201
226	print STDERR "ham: " . join(' ', @ham) . "\n";	202	print STDERR "logs: " . join(' ', @files) . "\n";
227	print STDERR "spam: " . join(' ', @spam) . "\n";
228	}	203	}
229	elsif ($age =~ /^(?:new\|all\|age)$/) {	204	elsif ($age =~ /^(?:new\|all\|age)$/) {
230	@ham = grep { -M "$_" < -M $opt{tagtime} } @ham;	205	@files = grep { -M "$_" < -M $opt{tagtime} } @files;
231	@spam = grep { -M "$_" < -M $opt{tagtime} } @spam;	206
232	@ham = grep { $revision{$_} eq $revision } @ham;	207	@files = grep { $revision{$_} eq $revision } @files;
233	@spam = grep { $revision{$_} eq $revision } @spam;	208
234	print STDERR "ham: " . join(' ', @ham) . "\n";	209	print STDERR "logs: " . join(' ', @files) . "\n";
235	print STDERR "spam: " . join(' ', @spam) . "\n";
236	}	210	}
237	elsif ($age =~ /(\d+)day/) {	211	elsif ($age =~ /(\d+)day/) {
238	my $mtime = $1;	212	my $mtime = $1;
239	@ham = grep { -M "$_" < $mtime } @ham;	213	@files = grep { -M "$_" < $mtime } @files;
240	@spam = grep { -M "$_" < $mtime } @spam;	214
241	print STDERR "ham: " . join(' ', @ham) . "\n";	215	print STDERR "logs: " . join(' ', @files) . "\n";
242	print STDERR "spam: " . join(' ', @spam) . "\n";
243	}	216	}
244		217
245	open(OUT, "> $opt{html}/$class.$age");	218	open(OUT, "> $opt{html}/$class.$age");
246	print OUT "# ham results used: " . join(" ", @ham) . "\n";	219	print OUT "# results used: " . join(" ", @files) . "\n";
247	print OUT "# spam results used: " . join(" ", @spam) . "\n";	220
248	for (@ham) {	221	for (@files) {
249	print OUT "# $_=$revision{$_}\n" if $revision{$_} ne $revision;	222	print OUT "# $_=$revision{$_}\n" if $revision{$_} ne $revision;
250	}	223	}
251	for (@spam) {
252	print OUT "# $_=$revision{$_}\n" if $revision{$_} ne $revision;
253	}
254		224
255	my $flags = "";	225	my $flags = "";
256	$flags = "-t net -s 1" if $class eq "NET";	226	$flags = "-t net -s 1" if $class eq "NET";
257	$flags = "-M HTML_MESSAGE" if $class eq "HTML";	227	$flags = "-M HTML_MESSAGE" if $class eq "HTML";
258		228
259	if ($age eq "all") {	229	if ($age eq "all") {
260	my %spam;	230	my %logs;
261	my %ham;
262	my @output;	231	my @output;
263		232
264	for my $file (@spam) {	233	for my $file (@files) {
265	$spam{$1} = $file if ($file =~ m/-(\w+)\.log$/);	234	$logs{$1} = $file if ($file =~ m/-(\w+)\.log$/);
266	}	235	}
267	for my $file (@ham) {	236
268	$ham{$1} = $file if ($file =~ m/-(\w+)\.log$/);	237	unlink "$opt{tmp}/masses.log.$$";
269	}	238
270	unlink "$opt{tmp}/ham.log.$$";	239	next unless (scalar keys %logs);
271	unlink "$opt{tmp}/spam.log.$$";	240	for my $user (sort keys %logs) {
272	next unless (scalar keys %spam && scalar keys %ham);	241
273	for my $user (sort keys %spam) {
274	next unless defined $ham{$user};
275	chdir "$opt{tree}/masses";	242	chdir "$opt{tree}/masses";
276	system("cat $opt{corpus}/$ham{$user} >> $opt{tmp}/ham.log.$$");	243	system("cat $opt{corpus}/$logs{$user} >> $opt{tmp}/masses.log.$$");
277	system("cat $opt{corpus}/$spam{$user} >> $opt{tmp}/spam.log.$$");	244	open(IN, "./hit-frequencies -xpa $flags -l $opt{corpus}/$logs{$user} \|");
278	open(IN, "./hit-frequencies -xpa $flags $opt{corpus}/$spam{$user} $opt{corpus}/$ham{$user} \|");
279	while(<IN>) {	245	while(<IN>) {
280	chomp;	246	chomp;
281	push @output, "$_:$user\n";	247	push @output, "$_:$user\n";
282	}	248	}
283	close(IN);	249	close(IN);
284	}	250	}
285	open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ \|");	251	open(IN, "./hit-frequencies -xpa $flags -l $opt{tmp}/masses.log.$$ \|");
286	while(<IN>) {	252	while(<IN>) {
287	push @output, $_;	253	push @output, $_;
288	}	254	}
Lines 298-318 Link Here
298	my ($after, $before) = split(/-/, $which);	264	my ($after, $before) = split(/-/, $which);
299	# get and filter logs	265	# get and filter logs
300	chdir $opt{corpus};	266	chdir $opt{corpus};
301	for my $type (("ham", "spam")) {	267
302	open(TMP, "> $opt{tmp}/$type.log.$$");	268	open(TMP, "> $opt{tmp}/masses.log.$$");
303	my @array = ($type eq "ham") ? @ham : @spam;	269	for my $file (@files) {
304	for my $file (@array) {	270	open(IN, $file);
305	open(IN, $file);	271	while (<IN>) {
306	while (<IN>) {	272	print TMP $_ if time_filter($after, $before);
307	print TMP $_ if time_filter($after, $before);	273	}
308	}	274	close(IN);
309	close(IN);
310	}
311	close (TMP);
312	}	275	}
		276	close (TMP);
		277
313	# print out by age	278	# print out by age
314	chdir "$opt{tree}/masses";	279	chdir "$opt{tree}/masses";
315	open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ \|");	280	open(IN, "./hit-frequencies -xpa $flags -l $opt{tmp}/masses.log.$$ \|");
316	while(<IN>) {	281	while(<IN>) {
317	chomp;	282	chomp;
318	push @output, "$_:$which\n";	283	push @output, "$_:$which\n";
Lines 323-335 Link Here
323	print OUT $_;	288	print OUT $_;
324	}	289	}
325	}	290	}
326	elsif (@ham && @spam) {	291	elsif (@files) {
327	# get logs	292	# get logs
328	system("cat " . join(" ", @ham) . " > $opt{tmp}/ham.log.$$");	293	system("cat " . join(" ", @files) . " > $opt{tmp}/masses.log.$$");
329	system("cat " . join(" ", @spam) . " > $opt{tmp}/spam.log.$$");
330
331	chdir "$opt{tree}/masses";	294	chdir "$opt{tree}/masses";
332	open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ \|");	295	open(IN, "./hit-frequencies -xpa $flags -l $opt{tmp}/masses.log.$$ \|");
333	while(<IN>) {	296	while(<IN>) {
334	print(OUT);	297	print(OUT);
335	}	298	}

Line 0 Link Here

(-)lib/Mail/SpamAssassin/Masses.pm (+788 lines)
		1	# <@LICENSE>
		2	# Copyright 2004 Apache Software Foundation
		3	#
		4	# Licensed under the Apache License, Version 2.0 (the "License");
		5	# you may not use this file except in compliance with the License.
		6	# You may obtain a copy of the License at
		7	#
		8	# http://www.apache.org/licenses/LICENSE-2.0
		9	#
		10	# Unless required by applicable law or agreed to in writing, software
		11	# distributed under the License is distributed on an "AS IS" BASIS,
		12	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		13	# See the License for the specific language governing permissions and
		14	# limitations under the License.
		15	# </@LICENSE>
		16
		17	=head1 NAME
		18
		19	Mail::SpamAssassin::Masses - Interface for reading and parsing rules
		20	and mass-check logs for SpamAssassin
		21
		22	=head1 SYNOPSIS
		23
		24	my $parser = Mail::SpamAssassin::Masses->new();
		25	my $rules = $parser->readrules();
		26	my $logs = $parser->readlogs();
		27
		28	foreach my $test (keys %$rules) {
		29	if ($rules->{$test}->{score} > 1) {
		30	...
		31	}
		32
		33	=head1 DESCRIPTION
		34
		35	Mail::SpamAssassin::Masses is a module to simplify the many scripts
		36	that used to make up the SpamAssassin re-scoring process. By
		37	consolidating all the shared code in one module, the scripts can be
		38	simplified and require fewer temporary files.
		39
		40	=head1 METHODS
		41
		42	=over 4
		43
		44	=cut
		45
		46	package Mail::SpamAssassin::Masses;
		47
		48	use strict;
		49	use warnings;
		50	use Carp;
		51
		52	=item $parser = Mail::SpamAssassin::Masses->new( [ { opt => val, ... } ] );
		53
		54	Construct a new Mail::SpamAssassin::Masses object. You may pass the
		55	following attribute-value pairs to the constructor.
		56
		57	=over 4
		58
		59	=item rulesdir
		60
		61	The directory containing rules. If multiple directories are desired,
		62	an anonymous array should be passed.
		63
		64	=item scoreset
65
66	Scoreset to deal with.
67
68	=item logfile
69
70	Filename of mass-check log.
71
72	=item falses
73
74	Also count frequencies for false positives and false negatives from
75	the logs.
76
77	=item falsesonly
78
79	Only count false positives and false negatives.
80
81	=item greprules
82
83	Coderef that is passed a rule name and a hash ref with the entries
84	containing info about the rule. If the sub returns false, it is skipped.
85
86	=item greplogs
87
88	Coderef that is passed a raw log entry. If it returns false, the entry
89	is skipped.
90
91	=item sliding_window
92
93	Use a sliding window for score ranges rather than a shrinking window.
94
95	=item nologs
96
97	Save memory by not saving the individual log results, just the
98	aggregate totals
99
100	=back
101
102	=cut
103
104	sub new {
105
106	my $class = shift;
107	$class = ref($class) \|\| $class;
108
109	my $self = shift;
110	if (!defined $self){
111	$self = { };
112	}
113
114	$self->{scoreset} \|\|= 0;
115	$self->{rulesdir} \|\|= '';
116	$self->{logfile} \|\|= "masses.log";
117
118	bless($self, $class);
119
120	return $self;
121
122	}
123
124	=item $parser->readrules()
125
126	Read and parse the rules from the directory specified as
127	C<rulesdir>. This loads the following keys and values into the hash
128	entry representing the rules (see below).
129
130	=over 4
131
132	=item name
133
134	Contains the rule's name.
135
136	=item score
137
138	Contains the rule's score.
139
140	=item type
141
142	Contains the rule's type (header, body, uri, etc.)
143
144	=item tflags
145
146	Contains the rules tflags (nice, autolearn, etc.) as specified in the config file.
147
148	=item lang
149
150	Set to the value of C<lang> for language-specific tests.
151
152	=item issubrule
153
154	Set to true if the rules is a sub-rule, (i.e. it starts with
155	__). Otherwise, undefined.
156
157	=item isnice
158
159	This key exists and is true if the rule is nice (i.e. with a score
160	that can be below zero).
161
162	=item describe
163
164	Set to the rule's description, in English, or in the rule's language.
165
166	=back
167
168	There may be more values once C<readlogs()> is run.
169
170	=cut
171
172
173	sub readrules {
174
175	my $self = shift;
176
177	$self->{rules} \|\|= { };
178	my $rules = $self->{rules}; # $rules is a reference to the anon hash
179
180	my @dirs = ref($self->{rulesdir}) ? @{$self->{rulesdir}} : $self->{rulesdir};
181
182	my @files;
183
184	foreach my $indir (@dirs) {
185	if (-d $indir) {
186	@files = glob("$indir/*.cf"); # no reason to only do numbered files
187	} else {
188	@files = ( $indir );
189	}
190
191	foreach my $file (@files) {
192	open (IN, "<$file") \|\| croak("Can't open $file, $!");
193	while(<IN>) {
194	s/#.*$//g;
195	s/^\s+//;
196	s/\s+$//;
197	next if /^$/;
198
199	my $lang = '';
200	if (s/^lang\s+(\S+)\s+//) {
201	$lang = lc $1;
202	}
203
204	if (/^(header\|rawbody\|body\|full\|uri\|meta)\s+(\S+)\s+/) {
205	my $type = $1;
206	my $name = $2;
207
208	$rules->{$name} \|\|= { };
209	$rules->{$name}->{name} = $name;
210	$rules->{$name}->{type} = $type;
211	$rules->{$name}->{lang} = $lang if $lang;
212	$rules->{$name}->{tflags} = '';
213
214	if ($name =~ /^__/) {
215	$rules->{$name}->{issubrule} = '1';
216	}
217
218	} elsif (/^describe\s+(\S+)\s+(.+)$/) {
219
220	# Let's get description in english, por favor -- unless the rule isn't english
221
222	next if ($lang && (!$rules->{$1}->{lang} \|\| $rules->{$1}->{lang} ne $lang));
223
224	$rules->{$1} \|\|= { };
225	$rules->{$1}->{describe} = $2;
226
227	} elsif (/^tflags\s+(\S+)\s+(.+)$/) {
228	my $name = $1;
229	$rules->{$name} \|\|= { };
230	$rules->{$name}->{tflags} = $2;
231	if ($2 =~ /nice/) {
232	$rules->{$name}->{isnice} = 1;
233	}
234	} elsif (/^score\s+(\S+)\s+(.+)$/) {
235	my($name,$score) = ($1,$2);
236	$rules->{$name} \|\|= { };
237	if ( $score =~ /\s/ ) { # there are multiple scores
238	($score) = (split(/\s+/,$score))[$self->{scoreset}];
239	}
240	$rules->{$name}->{score} = $score;
241	}
242	}
243	close IN;
244	}
245	}
246	foreach my $rule (keys %{$rules}) {
247	if (!defined $rules->{$rule}->{type}) {
248	delete $rules->{$rule}; # no rule definition -> no rule
249	next;
250	}
251
252	if (!defined $rules->{$rule}->{score}) {
253	my $def = 1.0;
254	if ($rule =~ /^T_/) { $def = 0.01; }
255
256	if ($rules->{$rule}->{isnice}) {
257	$rules->{$rule}->{score} = -$def;
258	} else {
259	$rules->{$rule}->{score} = $def;
260	}
261	}
262
263	if ($self->{greprules} && !&{$self->{greprules}}($rule, $rules->{$rule}))
264	{
265	delete $rules->{$rule};
266	next;
267	}
268
269	}
270
271	$self->{_readrules} = 1;
272	}
273
274	=item $parser->readlogs()
275
276	Read and parse logs from C<logfile>. This will create the anonymous
277	array of hashes referred to by C<$parser->{logs}>, with the following
278	keys:
279
280	=over 4
281
282	=item isspam
283
284	True if the message is spam. False or undefined otherwise.
285
286	=item isfalse
287
288	True if the message was a false negative or positive.
289
290	=item tests_hit
291
292	Array reference containing references to the hash representing each
293	rule hit.
294
295	=item score
296
297	Score the message received (under current scores).
298
299	=back
300
301	In addition, this method adds the following keys to the rule
302	information in C<$parser->{rules}>.
303
304	=over 4
305
306	=item freq_spam
307
308	Frequency hit in spam.
309
310	=item freq_ham
311
312	Frequency hit in ham.
313
314	=item freq_fp
315
316	Frequency in false positives.
317
318	=item freq_fn
319
320	Frequency in false negatives.
321
322	=back
323
324	Also, sets C<$parser->{num_spam}> and C<$parser->{num_ham}> to the number of
325	spam logs read and the number of ham logs read, respectively.
326
327	=cut
328
# Read the mass-check log named by $self->{logfile} and tally, per rule,
# how often it hit spam and ham (and, when $self->{falses} is set, how
# often it hit false negatives / false positives).
#
# Reads from $self: logfile, rules, nologs, falses, falsesonly, greplogs.
# Writes to $self:  num_spam, num_ham, (num_fn, num_fp when falses is set),
#                   logs (unless nologs), _readlogs.
# Dies if the log file cannot be opened.  Returns 1.
sub readlogs {
  my $self = shift;

  # Rule scores are needed to total up each message's score.
  $self->readrules() unless $self->{_readrules};

  my $rules = $self->{rules};    # copy the ref, shorthand

  # Per-message records are only built when the caller wants them.
  my $keep_logs = !$self->{nologs};
  my $logs;
  if ($keep_logs) {
    $self->{logs} ||= [ ];
    $logs = $self->{logs};
  }

  my ($num_spam, $num_ham, $count, $num_fp, $num_fn) = (0, 0, 0, 0, 0);

  # Make sure every rule has its counters, even if it never hits.
  foreach my $rule (values %{$self->{rules}}) {
    $rule->{freq_spam} ||= 0;
    $rule->{freq_ham} ||= 0;

    if ($self->{falses}) {
      $rule->{freq_fp} ||= 0;
      $rule->{freq_fn} ||= 0;
    }
  }

  my $file = $self->{logfile};
  die "No logfile specified\n" unless defined $file;
  open (my $in, '<', $file) or die "Can't open log file '$file': $!\n";

  while (my $line = <$in>) {
    next if $line =~ /^\#/;
    next if $line =~ /^$/;

    # <manual class> <detected class> <score> <path> <comma-separated tests>
    my ($manual, $result, $hits) =
      $line =~ /^(.)\s+(.)\s+-?[\d.]+\s+\S+(\s+\S+\s+)/;
    if (!defined $manual) {
      warn "bad line: $line";
      next;
    }

    if ($self->{greplogs}) {
      # The filter receives the raw line as its argument; $_ is set as
      # well so that filters written against $_ keep working.
      local $_ = $line;
      next unless $self->{greplogs}->($line);
    }

    $hits =~ s/(?:bayes|time)=\S+//;
    $hits =~ s/,,+/,/g;
    $hits =~ s/^\s+//;
    $hits =~ s/\s+$//;

    my $is_false = ($manual ne $result);
    next if (!$is_false && $self->{falsesonly});

    # Start from a fresh record so that a re-read never inherits flags
    # from whatever was previously stored at this index.
    my $entry;
    if ($keep_logs) {
      $entry = $logs->[$count] = { };
      $entry->{isfalse} = 1 if $is_false;
    }

    if ($manual eq "s") {
      $num_spam++;
      $entry->{isspam} = 1 if $entry;
      $num_fn++ if $result eq "h";
    } else {
      $num_ham++;
      $num_fp++ if $result eq "s";
    }

    my @tests = ();
    my $score = 0;
    foreach my $tst (split (/,/, $hits)) {
      next if ($tst eq '');

      # Don't count non-existent rules
      # (Could happen with greprules)
      next if ( !$rules->{$tst} || !$rules->{$tst}->{type} );

      if ($manual eq "s") {
        $rules->{$tst}->{freq_spam}++;
        $rules->{$tst}->{freq_fn}++ if ($self->{falses} && $result eq "h");
      }
      else {
        $rules->{$tst}->{freq_ham}++;
        $rules->{$tst}->{freq_fp}++ if ($self->{falses} && $result eq "s");
      }

      $score += $rules->{$tst}->{score};

      push (@tests, $rules->{$tst}) if $entry;
    }

    if ($entry) {
      $entry->{tests_hit} = \@tests;
      $entry->{score} = $score;
    }

    $count++;
  }
  close $in;

  $self->{num_spam} = $num_spam;
  $self->{num_ham} = $num_ham;
  if ($self->{falses}) {
    $self->{num_fn} = $num_fn;
    $self->{num_fp} = $num_fp;
  }

  $self->{_readlogs} = 1; # Done reading logs
  return 1;
}
439
440	=item $parser->do_statistics();
441
442	Calculate the S/O ratio and the rank for each test.
443
444	This adds the following keys to the rules hashes.
445
446	=over 4
447
448	=item spam_percent
449
450	Percentage of spam messages hit.
451
452	=item ham_percent
453
454	Percentage of ham messages hit.
455
456	=item soratio
457
458	S/O ratio -- percentage of spam messages hit divided by total
459	percentage of messages hit.
460
461	=back
462
463	=cut
464
# Derive spam_percent, ham_percent and soratio for every rule from the
# hit counters (freq_spam / freq_ham) and the message totals
# (num_spam / num_ham).  Reads the logs first if that has not happened.
sub do_statistics {
  my ($self) = @_;

  $self->readlogs() unless $self->{_readlogs};

  for my $rule (values %{$self->{rules}}) {
    my $spam_hits = $rule->{freq_spam};
    my $ham_hits = $rule->{freq_ham};

    $rule->{spam_percent} =
      $spam_hits ? $rule->{freq_spam} / $self->{num_spam} * 100.0 : 0;
    $rule->{ham_percent} =
      $ham_hits ? $rule->{freq_ham} / $self->{num_ham} * 100.0 : 0;

    # A rule that never fires tells us nothing either way.
    $rule->{soratio} = ($spam_hits || $ham_hits)
      ? $rule->{spam_percent} / ($rule->{spam_percent} + $rule->{ham_percent})
      : 0.5;
  }

  $self->{_statistics} = 1;
}
501
502	=item $parser->do_rank();
503
504	Calculates the ranking for each rule and stores this in the
505	appropriate key.
506
507	=over 4
508
509	=item rank
510
511	"Rank" of the rule. High numbers are good, low are bad.
512
513	=back
514
515	=cut
516
# Rank every rule on a [0, 1] scale.  A rule climbs the ranking the more
# "wanted" hits it has (spam for ordinary rules, ham for nice rules) and
# the fewer "unwanted" hits it has.  Runs do_statistics() first if needed.
sub do_rank {
  my ($self) = @_;

  $self->do_statistics() unless $self->{_statistics};

  my $rules = $self->{rules};

  my (%good_hits, %bad_hits);        # rule name => hit count
  my (%good_levels, %bad_levels);    # distinct hit counts seen

  for my $rule (values %{$self->{rules}}) {
    my $name = $rule->{name};

    $good_hits{$name} = $rule->{isnice} ? $rule->{freq_ham} : $rule->{freq_spam};
    $bad_hits{$name} = $rule->{isnice} ? $rule->{freq_spam} : $rule->{freq_ham};

    $good_levels{$good_hits{$name}} = 1;
    $bad_levels{$bad_hits{$name}} = 1;
  }

  # first half of ranking: ascending by wanted hits, ties share a position
  my ($step, $prev) = (0, undef);
  for my $name (sort { $good_hits{$a} <=> $good_hits{$b} } keys %good_hits) {
    $step++ if defined $prev && $prev != $good_hits{$name};
    $rules->{$name}->{rank} += $step;
    $prev = $good_hits{$name};
  }

  # Avoid divide by 0 errors!
  die "Error: no rules read" if (!(scalar keys %bad_levels));

  # Scale the second half so both halves span the same number of steps.
  my $normalize = (scalar keys %good_levels) / (scalar keys %bad_levels);

  my ($rank_hi, $rank_lo) = (0, 9999999);

  # second half: descending by unwanted hits, so fewer is better
  ($step, $prev) = (0, undef);
  for my $name (sort { $bad_hits{$b} <=> $bad_hits{$a} } keys %bad_hits) {
    $step++ if defined $prev && $prev != $bad_hits{$name};
    $rules->{$name}->{rank} += ($step * $normalize);
    $prev = $bad_hits{$name};

    my $raw = $rules->{$name}->{rank};
    $rank_hi = $raw if ($raw > $rank_hi);
    $rank_lo = $raw if ($raw < $rank_lo);
  }

  # squeeze the raw ranks into [0, 1]
  my $spread = $rank_hi - $rank_lo;
  for my $rule (values %{$rules}) {
    $rule->{rank} = ($spread == 0) ? 0.001 : (($rule->{rank} - $rank_lo) / $spread);
  }

  $self->{_rank} = 1;
}
582
583	=item $parser->get_rules_array();
584
585	Returns a reference to an array of hash references. The values of
586	these hash have keys as listed above.
587
588	=cut
589
# Return a new array reference holding every rule hash (order unspecified).
sub get_rules_array {
  my ($self) = @_;
  my @all_rules = values %{$self->{rules}};
  return \@all_rules;
}
594
595	=item $parser->get_rules_hash();
596
597	Returns a reference to a hash with rule names as keys and hash
598	references as values. The values of these hash have keys as listed
599	above.
600
601	=cut
602
# Return the rule-name => rule-hash reference held by the parser
# (the same reference, not a copy).
sub get_rules_hash {
  my ($self) = @_;
  return $self->{rules};
}
607
608	=item $parser->get_logs();
609
610	Returns a reference to the array containing log entries, in the form
611	of anonymous hashes with keys as described above.
612
613	=cut
614
# Return the reference to the array of per-message log records.
sub get_logs {
  my ($self) = @_;
  return $self->{logs};
}
619
620	=item $parser->get_num_ham();
621
622	Returns number of ham logs read.
623
624	=cut
625
# Return the number of ham log lines counted by readlogs().
sub get_num_ham {
  my ($self) = @_;
  return $self->{num_ham};
}
630
631	=item $parser->get_num_spam();
632
633	Returns number of spam logs read.
634
635	=cut
636
# Return the number of spam log lines counted by readlogs().
sub get_num_spam {
  my ($self) = @_;
  return $self->{num_spam};
}
641
642	=item $parser->do_score_ranges();
643
644	Figure out range in which score can be set based on the soratio, etc.
645
646	This is necessary so that the perceptron doesn't set silly
647	scores. (This may not be as much of a problem as it was with the old
648	GA.)
649
650	This adds the following keys to the rules hashes:
651
652	=over 4
653
654	=item ismutable
655
656	Determines whether the perceptron can select a score for this test.
657
658	=item range_lo
659
660	Determines the lowest score the perceptron can set.
661
662	=item range_hi
663
Determines the highest score the perceptron can set.

=back

=cut
667
# Work out, for every rule, the score interval [range_lo, range_hi] the
# perceptron may choose from, and whether the score may change at all
# (ismutable).  Runs do_statistics() and do_rank() first when needed.
#
# Reads from each rule: rank, score, tflags, isnice, soratio,
#                       spam_percent, ham_percent.
# Reads from $self:     rules, scoreset.
# Writes to each rule:  ismutable, range_lo, range_hi.
#
# Declared without a prototype: this is a method taking $self, and an
# empty prototype would reject a plain call such as do_score_ranges($obj).
sub do_score_ranges {
  my $self = shift;

  $self->do_statistics() if !$self->{_statistics};
  $self->do_rank() if !$self->{_rank};

  foreach my $rule (values %{$self->{rules}}) {

    my ($lo, $hi);

    # Rules without any tflags are treated as having none.
    my $tflags = defined $rule->{tflags} ? $rule->{tflags} : '';

    # Get rid of rules that don't hit -- and disable completely.
    if ($rule->{spam_percent} + $rule->{ham_percent} < 0.01 ||
        $rule->{score} == 0) {

      $rule->{ismutable} = 0;
      $rule->{range_lo} = $rule->{range_hi} = 0;
      next;
    }

    # next: get rid of tests that don't apply in this scoreset
    # or are userconf -- set ismutable to 0, but keep the score
    if ($tflags =~ /\buserconf\b/ ||
        (($self->{scoreset} % 2) == 0 && $tflags =~ /\bnet\b/)) {

      $rule->{ismutable} = 0;
      $rule->{range_lo} = $rule->{range_hi} = $rule->{score};
      next;
    }

    # Normal rules:
    #
    # The rank is already in [0,1]; values above or below 0.5 carry no
    # special meaning, so it is used directly to scale the base range.
    if ($rule->{isnice}) {
      $hi = 0;
      $lo = $rule->{rank} * -4.5;
    } else {
      $hi = $rule->{rank} * 4.5;
      $lo = 0;
    }

    # Modify good rules to be lower
    if ($rule->{isnice}) {
      if ($tflags =~ /\blearn\b/) { # learn rules should get
                                    # higher scores (-5.4)
        $lo *= 1.8;
      }
      elsif ($rule->{soratio} <= 0.05 && $rule->{ham_percent} > 0.5) {
        $lo *= 1.5;
      }

      # The better the rule, the closer the upper bound sits to $lo.
      $hi = ($rule->{soratio} == 0) ? $lo :
            ($rule->{soratio} <= 0.005 ) ? $lo/1.1 :
            ($rule->{soratio} <= 0.010 && $rule->{ham_percent} > 0.2) ? $lo/2.0 :
            ($rule->{soratio} <= 0.025 && $rule->{ham_percent} > 1.5) ? $lo/10.0 :
            0;

      if ($rule->{soratio} >= 0.35) { # auto-disable bad nice rules
        ($lo, $hi) = (0,0);
      }
    }
    else { # Make non-nice rules have higher scores if they're good
      if ($tflags =~ /\blearn\b/) {
        $hi *= 1.8;
      }
      elsif ($rule->{soratio} >= 0.99 && $rule->{spam_percent} > 1.0) {
        $hi *= 1.5;
      }

      # The better the rule, the closer the lower bound sits to $hi.
      $lo = ($rule->{soratio} == 1) ? $hi :
            ($rule->{soratio} >= 0.995 ) ? $hi/4.0 :
            ($rule->{soratio} >= 0.990 && $rule->{spam_percent} > 1.0) ? $hi/8.0 :
            ($rule->{soratio} >= 0.900 && $rule->{spam_percent} > 10.0) ? $hi/24.0 :
            0;

      if ($rule->{soratio} <= 0.65) { # auto-disable bad rules
        ($lo, $hi) = (0,0);
      }
    }

    # Some sanity checking
    if ($hi < $lo) {
      ($lo, $hi) = ($hi, $lo);
    }

    $rule->{ismutable} = ($lo == $hi) ? 0 : 1;
    $rule->{range_lo} = $lo;
    $rule->{range_hi} = $hi;
  }

  return;
}
785
786
787	# Pacify perl
788	1;
0	- craig-evolve.scores	789	- craig-evolve.scores
1	+ craig-evolve.scores	790	+ craig-evolve.scores

Lines 1-148 Link Here

(-)masses/parse-rules-for-masses (-148 lines)
1	#!/usr/bin/perl
2	#
3	# <@LICENSE>
4	# Copyright 2004 Apache Software Foundation
5	#
6	# Licensed under the Apache License, Version 2.0 (the "License");
7	# you may not use this file except in compliance with the License.
8	# You may obtain a copy of the License at
9	#
10	# http://www.apache.org/licenses/LICENSE-2.0
11	#
12	# Unless required by applicable law or agreed to in writing, software
13	# distributed under the License is distributed on an "AS IS" BASIS,
14	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15	# See the License for the specific language governing permissions and
16	# limitations under the License.
17	# </@LICENSE>
18
# Print the command-line help text and terminate the script via die().
sub usage {
  my $help_text = <<'END_USAGE';

parse-rules-for-masses: parse the SpamAssassin rules files for mass-checks,
evolving, and frequency analysis

usage: ./parse-rules-for-masses [-d rulesdir] [-o outputfile] [-s scoreset]

rulesdir defaults to ../rules
outputfile defaults to ./tmp/rules.pl
scoreset default to 0

END_USAGE
  die $help_text;
}
32
33	use Getopt::Long;
34	use Data::Dumper;
35
# Command-line driver: parse options, fill in defaults, read the rules,
# collect their scores and dump both structures to the output file.
use vars qw(@rulesdirs $outputfile $scoreset);
GetOptions (
  "d=s" => \@rulesdirs,
  "o=s" => \$outputfile,
  "s=i" => \$scoreset,
  "help|h|?" => sub { usage(); },
);

# Defaults for anything not given on the command line.
@rulesdirs = ("../rules") if ($#rulesdirs < 0);

unless (defined $outputfile) {
  $outputfile = "./tmp/rules.pl";
  mkdir ("tmp", 0755);
}

$scoreset = 0 unless defined $scoreset;

# $rules is filled in by readrules(); $scores maps rule name => score.
my $rules = { };
readrules(@rulesdirs);

my $scores = { };
$scores->{$_} = $rules->{$_}->{score} for keys %{$rules};

writerules($outputfile);
exit;
64
# Parse every "[0-9]*.cf" file in the given directories and record, in the
# file-level $rules hash, each rule's type, lang, issubrule flag, tflags,
# description and the score for the file-level $scoreset.
#
# Afterwards, names that were mentioned (describe/tflags/score) but never
# defined are dropped, and rules without an explicit score get a default:
# 1.0 (0.01 for "T_" test rules), negated when tflags contains "nice".
#
# Files that cannot be opened are reported with a warning and skipped.
sub readrules {
  foreach my $indir (@_) {
    my @files = <$indir/[0-9]*.cf>;

    foreach my $file (sort @files) {
      my $in;
      unless (open ($in, '<', $file)) {
        warn "cannot read $file: $!\n";
        next;
      }

      while (my $line = <$in>) {
        # strip comments and surrounding whitespace, skip empty lines
        $line =~ s/#.*$//g;
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;
        next if $line =~ /^$/;

        my $lang = '';
        if ($line =~ s/^lang\s+(\S+)\s+//) {
          $lang = $1;
        }

        if ($line =~ /^(header|rawbody|body|full|uri|meta)\s+(\S+)\s+/) {
          my ($type, $name) = ($1, $2);

          my $entry = ($rules->{$name} ||= { });
          $entry->{type} = $type;
          $entry->{lang} = $lang;
          $entry->{issubrule} = ($name =~ /^__/) ? '1' : '0';
          # keep tflags that were declared ahead of the rule definition
          $entry->{tflags} = '' unless defined $entry->{tflags};

        } elsif ($line =~ /^describe\s+(\S+)\s+(.+)$/) {
          $rules->{$1} ||= { };
          $rules->{$1}->{describe} = $2;

        } elsif ($line =~ /^tflags\s+(\S+)\s+(.+)$/) {
          $rules->{$1} ||= { };
          $rules->{$1}->{tflags} = $2;

        } elsif ($line =~ /^score\s+(\S+)\s+(.+)$/) {
          my ($name, $score) = ($1, $2);
          $rules->{$name} ||= { };
          if ( $score =~ /\s/ ) { # there are multiple scores
            ($score) = (split(/\s+/,$score))[$scoreset];
          }
          $rules->{$name}->{score} = $score;
        }
      }
      close $in;
    }
  }

  foreach my $rule (keys %{$rules}) {
    if (!defined $rules->{$rule}->{type}) {
      delete $rules->{$rule}; # no rule definition -> no rule
      next;
    }

    if (!defined $rules->{$rule}->{score}) {
      my $def = 1.0;
      if ($rule =~ /^T_/) { $def = 0.01; }

      if ($rules->{$rule}->{tflags} =~ /nice/) {
        $rules->{$rule}->{score} = -$def;
      } else {
        $rules->{$rule}->{score} = $def;
      }
    }
  }
}
133
# Dump the file-level $rules and $scores structures to $outfile as Perl
# source (via Data::Dumper) that evaluates to a true value when loaded.
# The directory that will hold $outfile is created first if necessary.
# Dies if the file cannot be opened for writing or closed cleanly.
sub writerules {
  my $outfile = shift;

  # Create the containing directory without going through a shell, so
  # that paths containing spaces or shell metacharacters are safe.
  # Failure is tolerated here; the open below reports the real problem.
  require File::Basename;
  require File::Path;
  eval { File::Path::mkpath(File::Basename::dirname($outfile)) };

  open (my $out, '>', $outfile) or die "cannot write to $outfile: $!";
  print {$out} "# dumped at " . scalar(localtime) . "\n\n";

  # Purity makes Dumper emit code that rebuilds shared references.
  local $Data::Dumper::Purity = 1;
  print {$out} Data::Dumper->Dump ([$rules, $scores], ['rules', 'scores']);

  print {$out} "1;";
  close $out or die "error writing $outfile: $!";
}
148

Lines 16-400 Link Here

(-)masses/hit-frequencies (-312 / +183 lines)
16	# limitations under the License.	16	# limitations under the License.
17	# </@LICENSE>	17	# </@LICENSE>
18		18
		19
19	use FindBin;	20	use FindBin;
20	use Getopt::Std;	21	use lib "$FindBin::Bin/../lib";
21	getopts("fm:M:X:l:L:pxhc:at:s:i");	22	use Mail::SpamAssassin::Masses;
		23	use Getopt::Long qw(:config bundling auto_help);
		24	use Pod::Usage;
		25	use strict;
		26	use warnings;
22		27
		28
23	use vars qw {	29	use vars qw {
24	$opt_f $opt_m $opt_M $opt_X $opt_p $opt_x $opt_h $opt_l $opt_L $opt_c	30	$opt_f $opt_m $opt_M $opt_X $opt_p $opt_x $opt_h $opt_l $opt_L $opt_c
25	$opt_a $opt_t $opt_s $opt_i $sorting	31	$opt_a $opt_t $opt_s $opt_z $opt_inclang $opt_auto
26	};	32	};
27		33
28	sub usage {	34	GetOptions("c\|cffile=s@" => \$opt_c,
29	die "hit-frequencies [-c rules dir] [-f] [-m RE] [-M RE] [-X RE] [-l LC]	35	"s\|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
30	[-s SC] [-a] [-p] [-x] [-i] [spam log] [ham log]	36	"l\|logfile=s" => \$opt_l,
		37	"f\|falses" => \$opt_f,
		38	"a\|all" => \$opt_a,
		39	"p\|percentages" => \$opt_p,
		40	"x\|extended" => \$opt_x,
		41	"m\|matchrule=s" => \$opt_m, #,
		42	"t\|tflags=s" => \$opt_t,
		43	"M\|matchlog=s" => \$opt_M,
		44	"X\|excludelog=s" => \$opt_X,
		45	"L\|language=s" => \$opt_L,
		46	"include-language=s" => \$opt_inclang);
31		47
32	-c p use p as the rules directory
33	-f falses. count only false-negative or false-positive matches
34	-m RE print rules matching regular expression
35	-t RE print rules with tflags matching regular expression
36	-M RE only consider log entries matching regular expression
37	-X RE don't consider log entries matching regular expression
38	-l LC also print language specific rules for lang code LC (or 'all')
39	-L LC only print language specific rules for lang code LC (or 'all')
40	-a display all tests
41	-p percentages. implies -x
42	-x extended output, with S/O ratio and scores
43	-s SC which scoreset to use
44	-i use IG (information gain) for ranking
45		48
46	options -l and -L are mutually exclusive.	49	=head1 NAME
47		50
48	options -M and -X are not mutually exclusive.	51	hit-frequencies - Display statistics about tests hit by a mass-check run
49		52
50	if either the spam or and ham logs are unspecified, the defaults	53	=head1 SYNOPSIS
51	are \"spam.log\" and \"ham.log\" in the cwd.
52		54
53	";	55	hit-frequencies [options]
54	}
55		56
56	usage() if($opt_h \|\| ($opt_l && $opt_L));	57	Options:
		58	-c,--cffile=path Use path as the rules directory
		59	-s,--scoreset=n Use scoreset n
		60	-l,--logfile=file Read in file instead of masses.log
		61	-f Count only false-positives/false-negatives
		62	-a Report all tests (including subrules)
		63	-p Report percentages instead of raw hits
		64	-x "Extended" output, include RANK, S/O and SCORE
		65	-m,--matchrule=re Print rules matching the regular expression
		66	-t,--tflags=re Print only rules with tflags matching the regular expression
		67	-M,--matchlog=re Consider only logs matching the regular expression
		68	-X,--excludelog=re Exclude logs matching this regular expression
		69	-L,--language=lc Only print language specific tests for specified lang code (try 'all')
		70	--include-language=lc Also print language specific tests for specified lang code (try 'all')
57		71
58	if ($opt_p) {	72	=head1 DESCRIPTION
59	$opt_x = 1;
60	}
61		73
62	$opt_s = 0 if ( !defined $opt_s );	74	B<hit-frequencies> will read the mass-check log F<masses.log> or the
		75	log given by the B<--logfile> option. The output will contain a
		76	summary of the number of ham and spam messages and detailed statistics
		77	for each rule. By default, B<hit-frequencies> will try to guess the
		78	proper values for B<--cffile> based on the header of the
		79	masses.log. The output will include the following columns:
63		80
64	my $cffile = $opt_c \|\| "$FindBin::Bin/../rules";	81	=over 4
65		82
66	my %freq_spam = ();	83	=item OVERALL
67	my %freq_ham = ();
68	my $num_spam = 0;
69	my $num_ham = 0;
70	my %ranking = ();
71	my $ok_lang = '';
72		84
73	readscores($cffile);	85	Number of times (or percentage with B<-p>) the rule hit on
		86	all messages (spam or ham).
74		87
75	$ok_lang = lc ($opt_l \|\| $opt_L \|\| '');	88	=item SPAM
76	if ($ok_lang eq 'all') { $ok_lang = '.'; }
77		89
78	foreach my $key (keys %rules) {	90	Number of times (or percentage with B<-p>) the rule hit on
		91	spam messages.
79		92
80	if ( ($opt_L && !$rules{$key}->{lang}) \|\|	93	=item HAM
81	($rules{$key}->{lang} &&
82	(!$ok_lang \|\| $rules{$key}->{lang} !~ /^$ok_lang/i)
83	) ) {
84	delete $rules{$key} ; next;
85	}
86		94
87	$freq_spam{$key} = 0;	95	Number of times (or percentage with B<-p>) the rule hit on
88	$freq_ham{$key} = 0;	96	ham messages.
89	}
90		97
91	readlogs();	98	=item S/O
92		99
93	my $hdr_all = $num_spam + $num_ham;	100	Shown only with B<-x> or B<-p>, this is the number of spam hits
94	my $hdr_spam = $num_spam;	101	divided by total number of hits (C<S/O> refers to spam divided by
95	my $hdr_ham = $num_ham;	102	overall).
96		103
97	if ($opt_p) {	104	=item RANK
98	my $sorting = $opt_i ? "IG" : "RANK";
99	if ($opt_f) {
100	printf "%7s %7s %7s %6s %6s %6s %s\n",
101	"OVERALL%", "FNEG%", "FPOS%", "S/O", $sorting, "SCORE", "NAME";
102	} else {
103	printf "%7s %7s %7s %6s %6s %6s %s\n",
104	"OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME";
105	}
106	printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n",
107	$hdr_all, $hdr_spam, $hdr_ham,
108	soratio ($num_spam,$num_ham), 0, 0;
109		105
110	$hdr_spam = ($num_spam / $hdr_all) * 100.0;	106	Shown only with B<-x> or B<-p>, this is a measure that attempts to
111	$hdr_ham = ($num_ham / $hdr_all) * 100.0;	107	indicate how I<good> or I<useful> a test is. The higher it is, the
112	$hdr_all = 100.0; # this is obvious	108	better the test.
113	printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f (all messages as %%)\n",
114	$hdr_all, $hdr_spam, $hdr_ham,
115	soratio ($num_spam,$num_ham), 0, 0;
116		109
117	} elsif ($opt_x) {	110	=item SCORE
118	printf "%7s %7s %7s %6s %6s %6s %s\n",
119	"OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME";
120	printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n",
121	$hdr_all, $hdr_spam, $hdr_ham,
122	soratio ($num_spam,$num_ham), 0, 0;
123		111
124	} else {	112	Shown only with B<-x> or B<-p>, this is the current score assigned to
125	printf "%10s %10s %10s %s\n",	113	the rule.
126	"OVERALL", "SPAM", "HAM", "NAME";
127	printf "%10d %10d %10d (all messages)\n",
128	$hdr_all, $hdr_spam, $hdr_ham;
129	}
130		114
131	my %done = ();	115	=item NAME
132	my @tests = ();
133	my $rank_hi = 0;
134	my $rank_lo = 9999999;
135		116
136	# variables for wanted/unwanted RANK	117	This is the rule's name.
137	my %wanted;
138	my %unwanted;
139	my %wranks;
140	my %uranks;
141		118
142	foreach my $test (keys %freq_spam, keys %freq_ham) {	119	=back
143	next unless (exists $rules{$test}); # only valid tests
144	next if (!$opt_a && $rules{$test}->{issubrule});
145		120
146	next if $done{$test}; $done{$test} = 1;	121	=head1 BUGS
147	push (@tests, $test);
148		122
149	my $isnice = 0;	123	Please report bugs to http://bugzilla.spamassassin.org/
150	if ($rules{$test}->{tflags} =~ /nice/) { $isnice = 1; }
151		124
152	my $fs = $freq_spam{$test}; $fs \|\|= 0;	125	=head1 SEE ALSO
153	my $fn = $freq_ham{$test}; $fn \|\|= 0;
154	my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0;
155	my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0;
156		126
157	my $soratio = $soratio{$test} = soratio ($fsadj, $fnadj);	127	L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>
158		128
159	if ($isnice) {	129	=cut
160	$soratio = 1.0 - $soratio;
161	my $tmp = $fsadj; $fsadj = $fnadj; $fnadj = $tmp;
162	}
163		130
164	if ($opt_i) {	131	if ($opt_L && $opt_inclang) {
165	# come up with a ranking	132	pod2usage("-L/--language and --include-language are mutually exclusive");
166	my $rank;
167
168	# New new system: from "Learning to Filter Unsolicited Commercial E-Mail",
169	# Ion Androutsopoulos et al: determine the information gain IG(X, C) of the
170	# Boolean attributes (ie. the rules). Measures "the average reduction in
171	# the entropy of C (classification) given the value of X (the rule)". Makes
172	# a good ranking measure with a proper statistical basis. ;)
173	#
174	# Still would like to get an entropy measure in, too.
175	#
176	# sum P(X = x ^ C = c)
177	# IG(X,C) = x in [0, 1] P(X = x ^ C = c) . log2( ------------------- )
178	# c in [Ch, Cs] P(X = x) . P(C = c)
179	#
180	my $safe_nspam = $num_spam \|\| 0.0000001;
181	my $safe_nham = $num_ham \|\| 0.0000001;
182
183	my $num_all = ($num_spam + $num_ham);
184	my $safe_all = $num_all \|\| 0.0000001;
185	my $f_all = $fs+$fn;
186
187	my $px0 = (($num_all - $f_all) / $safe_all); # P(X = 0)
188	my $px1 = ($f_all / $safe_all); # P(X = 1)
189	my $pccs = ($num_spam / $safe_all); # P(C = Cs)
190	my $pcch = ($num_ham / $safe_all); # P(C = Ch)
191	my $px1ccs = ($fs / $safe_nspam); # P(X = 1 ^ C = Cs)
192	my $px1cch = ($fn / $safe_nham); # P(X = 1 ^ C = Ch)
193	my $px0ccs = (($num_spam - $fs) / $safe_nspam); # P(X = 0 ^ C = Cs)
194	my $px0cch = (($num_ham - $fn) / $safe_nham); # P(X = 0 ^ C = Ch)
195	my $safe_px0_dot_pccs = ($px0 * $pccs) \|\| 0.00000001;
196	my $safe_px0_dot_pcch = ($px0 * $pcch) \|\| 0.00000001;
197	my $safe_px1_dot_pccs = ($px1 * $pccs) \|\| 0.00000001;
198	my $safe_px1_dot_pcch = ($px1 * $pcch) \|\| 0.00000001;
199
200	sub log2 { return log($_[0]) / 0.693147180559945; } # log(2) = 0.6931...
201
202	my $safe_px0ccs = ($px0ccs \|\| 0.0000001);
203	my $safe_px0cch = ($px0cch \|\| 0.0000001);
204	my $safe_px1ccs = ($px1ccs \|\| 0.0000001);
205	my $safe_px1cch = ($px1cch \|\| 0.0000001);
206	$rank = ( $px0ccs * log2($safe_px0ccs / $safe_px0_dot_pccs) ) +
207	( $px0cch * log2($safe_px0cch / $safe_px0_dot_pcch) ) +
208	( $px1ccs * log2($safe_px1ccs / $safe_px1_dot_pccs) ) +
209	( $px1cch * log2($safe_px1cch / $safe_px1_dot_pcch) );
210
211	$ranking{$test} = $rank;
212	$rank_hi = $rank if ($rank > $rank_hi);
213	$rank_lo = $rank if ($rank < $rank_lo);
214	}
215	else {
216	# basic wanted/unwanted ranking
217	$wanted{$test} = $isnice ? $fn : $fs;
218	$unwanted{$test} = $isnice ? $fs : $fn;
219	# count number of ranks of each type
220	$wranks{$wanted{$test}} = 1;
221	$uranks{$unwanted{$test}} = 1;
222	}
223	}	133	}
224		134
225	# finish basic wanted/unwanted ranking	135	if ($opt_p) {
226	if (! $opt_i) {	136	$opt_x = 1;
227	my @wanted = sort { $wanted{$a} <=> $wanted{$b} } keys %wanted;
228	my @unwanted = sort { $unwanted{$b} <=> $unwanted{$a} } keys %unwanted;
229
230	# first half of ranking is the wanted rank
231	my $position = 0;
232	my $last = undef;
233	for my $test (@wanted) {
234	$position++ if defined $last && $last != $wanted{$test};
235	$ranking{$test} += $position;
236	$last = $wanted{$test}
237	}
238
239	# second half of ranking is the unwanted rank
240	my $normalize = (scalar keys %wranks) / (scalar keys %uranks);
241	$position = 0;
242	$last = undef;
243	for my $test (@unwanted) {
244	$position++ if defined $last && $last != $unwanted{$test};
245	$ranking{$test} += ($position * $normalize);
246	$last = $unwanted{$test};
247	$rank_hi = $ranking{$test} if ($ranking{$test} > $rank_hi);
248	$rank_lo = $ranking{$test} if ($ranking{$test} < $rank_lo);
249	}
250	}	137	}
251		138
252	{	139	$opt_s = 0 if ( !defined $opt_s );
253	# now normalise the rankings to [0, 1]
254	$rank_hi -= $rank_lo;
255	foreach $test (@tests) {
256	$ranking{$test} = $rank_hi == 0 ? 0.001 : ($ranking{$test} - $rank_lo) / ($rank_hi);
257	}
258	}
259		140
260	foreach $test (sort { $ranking{$b} <=> $ranking{$a} } @tests) {	141	my $ok_lang = lc ( $opt_inclang \|\| $opt_L \|\| '');
261	next unless (exists $rules{$test}); # only valid tests	142	$ok_lang = '.' if ($ok_lang eq 'all');
262	next if (!$opt_a && $rules{$test}->{issubrule});
263		143
264	my $fs = $freq_spam{$test}; $fs \|\|= 0;	144	my $greprules = sub { # To determine whether rule should be read
265	my $fn = $freq_ham{$test}; $fn \|\|= 0;	145	my ($name, $rule) = @_;
266	my $fa = $fs+$fn;
267		146
268	next if ($opt_m && $test !~ m/$opt_m/); # match certain tests	147	return 0 if ($opt_m && $name !~ /$opt_m/); # name doesn't match -m
269	next if ($opt_t && $rules{$test}->{tflags} !~ /$opt_t/); # match tflags	148	# expression
		149	return 0 if ($opt_t && $rule->{tflags} !~ /$opt_t/); # tflags don't
		150	# match -t
		151	# expression
		152	return 0 if (($opt_L && !$rule->{lang}) \|\|
		153	($rule->{lang} &&
		154	(!$ok_lang \|\| $rule->{lang} !~ /^$ok_lang/i))); # Wrong language
270		155
		156	return 0 if ($rule->{issubrule} && !$opt_a);
		157
271	if (!$opt_a && !$opt_t) {	158	if (!$opt_a && !$opt_t) {
272	next if ($rules{$test}->{tflags} =~ /net/ && ($opt_s % 2 == 0)); # not net tests	159	return 0 if ($rule->{tflags} =~ /net/ && ($opt_s % 2 == 0));
273	next if ($rules{$test}->{tflags} =~ /userconf/); # or userconf	160	return 0 if ($rule->{tflags} =~ /userconf/); # or userconf
274	}	161	}
		162	return 1;
275		163
276	# adjust based on corpora sizes (and cvt to % while we're at it)	164	};
277	my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0;
278	my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0;
279		165
280	if ($opt_f && $fsadj == 0 && $fnadj == 0) { next; }
281		166
282	if ($opt_p) {	167	my $logfile = $opt_l \|\| "masses.log";
283	$fa = ($fa / ($num_spam + $num_ham)) * 100.0;
284	$fs = $fsadj;
285	$fn = $fnadj;
286	}
287		168
288	my $soratio = $soratio{$test};	169	if (!$opt_c \|\| !scalar(@$opt_c)) {
289	if (!defined $soratio) {	170	# Try to read this in from the log, if possible
290	$soratio{$test} = soratio ($fsadj, $fnadj);	171	open IN, $logfile or die "Can't open $logfile: $!";
291	}	172	my $files = 0; # are we in the files section?
		173	while(<IN>) {
		174	if (!$files) {
		175	if (/^\# SVN revision:/) {
		176	$opt_c = [ "$FindBin::Bin/../rules" ];
		177	last;
		178	} elsif (/^\# Using configuration:$/) {
		179	$files = 1;
		180	}
		181	} elsif (/^\#\s+(.)\s$/) {
		182	push (@$opt_c, $1);
		183	} else {
		184	# All done!
		185	last;
		186	}
		187	}
292		188
293	if ($opt_p) {	189	if (!defined $opt_c) {
294	printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f %s\n",	190	$opt_c = [ "$FindBin::Bin/../rules" ];
295	$fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}, $test;	191	}
296		192
297	} elsif ($opt_x) {	193	foreach my $file (@$opt_c) {
298	printf "%7d %7d %7d %7.3f %6.2f %6.2f %s\n",	194	die "Can't read $file" unless -r $file;
299	$fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}, $test;	195	}
300
301	} else {
302	printf "%10d %10d %10d %s\n", $fa, $fs, $fn, $test;
303	}
304	}	196	}
305	exit;	197
		198	my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
		199	scoreset => $opt_s,
		200	falsesonly => $opt_f,
		201	greprules => $greprules,
		202	logfile => $logfile,
		203	nologs => 1});
306		204
		205	$masses->readrules();
		206	$masses->readlogs();
		207	$masses->do_statistics();
		208	$masses->do_rank();
307		209
		210	my $rules = $masses->get_rules_hash();
		211	my $num_ham = $masses->get_num_ham();
		212	my $num_spam = $masses->get_num_spam();
		213	my $num_all = $num_ham + $num_spam;
308		214
309	sub readlogs {	215	if ($num_ham + $num_spam <= 0) {
310	my $spam = $ARGV[0] \|\| "spam.log";	216	die "Can't run hit-frequencies on 0 messages.";
311	my $ham = $ARGV[1] \|\| (-f "good.log" ? "good.log" : "ham.log");	217	}
312		218
313	foreach my $file ($spam, $ham) {	219	## Write header
314	open (IN, "<$file") \|\| die "Could not open file '$file': $!";
315		220
316	my $isspam = 0; ($file eq $spam) and $isspam = 1;	221	if ($opt_p) {
317		222
318	while (<IN>) {	223	if ($opt_f) {
319	next if (/^#/);	224	printf "%7s %7s %7s %6s %6s %6s %s\n",
320	next unless (!$opt_M \|\| /$opt_M/o);	225	"OVERALL%", "FNEG%", "FPOS%", "S/O", "RANK", "SCORE", "NAME";
321	next if ($opt_X && /$opt_X/o);	226	} else {
		227	printf "%7s %7s %7s %6s %6s %6s %s\n",
		228	"OVERALL%", "SPAM%", "HAM%", "S/O", "RANK", "SCORE", "NAME";
		229	}
322		230
323	/^(.)\s+(-?\d+)\s+(\S+)\s(\S)/ or next;	231	printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n",
324	my $caught = ($1 eq 'Y');	232	$num_all, $num_spam, $num_ham,
325	my $hits = $2;	233	$num_spam / $num_all, 0, 0;
326	$_ = $4; s/,,+/,/g;
327		234
328	if ($isspam) {	235	printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f (all messages as %%)\n",
329	if ($opt_f) {	236	100.0, $num_spam / $num_all * 100.0, $num_ham / $num_all * 100.0,
330	if (!$caught) { $num_spam++; }	237	$num_spam / $num_all, 0, 0;
331	} else {
332	$num_spam++;
333	}
334	} else {
335	if ($opt_f) {
336	if ($caught) { $num_ham++; }
337	} else {
338	$num_ham++;
339	}
340	}
341		238
342	my @tests = split (/,/, $_);	239	} elsif ($opt_x) {
343	foreach my $t (@tests) {	240	printf "%7s %7s %7s %6s %6s %6s %s\n",
344	next if ($t eq '');	241	"OVERALL", "SPAM", "HAM", "S/O", "RANK", "SCORE", "NAME";
345	if ($isspam) {	242	printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n",
346	if ($opt_f) {	243	$num_all, $num_spam, $num_ham,
347	if (!$caught) { $freq_spam{$t}++; }	244	$num_spam / $num_all, 0, 0;
348	} else {
349	$freq_spam{$t}++;
350	}
351	} else {
352	if ($opt_f) {
353	if ($caught) { $freq_ham{$t}++; }
354	} else {
355	$freq_ham{$t}++;
356	}
357	}
358	}
359	}
360	close IN;
361	}
362	}
363		245
364		246	} else {
365	sub readscores {	247	printf "%10s %10s %10s %s\n",
366	my($cffile) = @_;	248	"OVERALL", "SPAM", "HAM", "NAME";
367	system ("$FindBin::Bin/parse-rules-for-masses -d \"$cffile\" -s $opt_s") and die;	249	printf "%10d %10d %10d (all messages)\n",
368	require "./tmp/rules.pl";	250	$num_all, $num_spam, $num_ham;
369	}	251	}
370		252
371	sub soratio {	253	foreach my $test (sort { $rules->{$b}->{rank} <=> $rules->{$a}->{rank} } keys %{$rules}) {
372	my ($s, $n) = @_;
373		254
374	$s \|\|= 0;	255	if ($opt_p) {
375	$n \|\|= 0;	256	printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f %s\n",
376		257	($rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham}) / $num_all * 100.0,
377	if ($s + $n > 0) {	258	$rules->{$test}->{spam_percent}, $rules->{$test}->{ham_percent},
378	return $s / ($s + $n);	259	$rules->{$test}->{soratio}, $rules->{$test}->{rank}, $rules->{$test}->{score}, $test;
		260	} elsif ($opt_x) {
		261	printf "%7d %7d %7d %7.3f %6.2f %6.2f %s\n",
		262	$rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham},
		263	$rules->{$test}->{freq_spam}, $rules->{$test}->{freq_ham},
		264	$rules->{$test}->{soratio}, $rules->{$test}->{rank}, $rules->{$test}->{score}, $test;
379	} else {	265	} else {
380	return 0.5; # no results -> not effective	266	printf "%10d %10d %10d %s\n",
		267	$rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham},
		268	$rules->{$test}->{freq_spam}, $rules->{$test}->{freq_ham}, $test;
381	}	269	}
382	}	270	}
383		271
384	sub tcr {
385	my ($nspam, $nlegit, $nspamspam, $nlegitspam) = @_;
386	my $nspamlegit = $nspam - $nspamspam;
387	my $nlegitlegit = $nlegit - $nlegitspam;
388
389	my $lambda = 99;
390
391	my $werr = ($lambda * $nlegitspam + $nspamlegit)
392	/ ($lambda * $nlegit + $nspam);
393
394	my $werr_base = $nspam
395	/ ($lambda * $nlegit + $nspam);
396
397	$werr \|\|= 0.000001; # avoid / by 0
398	my $tcr = $werr_base / $werr;
399	return $tcr;
400	}

Line 0 Link Here

(-)masses/perceptron.pod (+30 lines)
	1	=head1 NAME
	2
	3	perceptron - Generate scores for SpamAssassin using the "Stochastic
	4	Gradient Method"
	5
	6	=head1 SYNOPSIS
	7
	8	perceptron [options]
	9
	10	Options:
	11	-p ham_preference Modifies tendency to prefer false negatives over
	12	false positives (default 2.0) (higher = less fp)
	13	-e num_epochs Set number of passes to make (default 15)
	14	-l learning_rate Modifies learning rate (default 2.0)
	15	-w weight_decay Scores multiplied by this value after each pass
	16	to prevent scores from getting too high
	17	(default off (1.0))
	18
	19	=head1 DESCRIPTION
	20
	21	This algorithm is used to optimize SpamAssassin scores, based on the
	22	input given by B<logs-to-c>. At the time of writing, the output of
	23	logs-to-c needs to be compiled into the source before perceptron can
	24	be used, but this will be fixed soon, I hope.
	25
	26	=head1 SEE ALSO
	27
	28	L<logs-to-c(1)>
	29
	30	=cut

Lines 16-47 Link Here

(-)masses/rewrite-cf-with-new-scores (-21 / +114 lines)
16	# limitations under the License.	16	# limitations under the License.
17	# </@LICENSE>	17	# </@LICENSE>
18		18
		19	=head1 NAME
		20
		21	rewrite-cf-with-new-scores - Rewrite SpamAssassin scores file with new
		22	scores.
		23
		24	=head1 SYNOPSIS
		25
		26	rewrite-cf-with-new-scores [options]
		27
		28	Options
		29	--old-scores=file Read file containing the old SpamAssassin scores
		30	--new-scores=file Read file containing the new SpamAssassin scores
		31	-s,--scoreset n Rewrite scoreset n
		32	--output=file Output rewritten score file to file
		33	-c,--cffile=path Use path as the rules directory
		34	-l,--logfile=file Use file instead of masses.log (for guessing -c)
		35
		36	Note: these options can be shortened (i.e. --old, --new, --out) as
		37	long as they are unambiguous.
		38
		39	=head1 DESCRIPTION
		40
		41	B<rewrite-cf-with-new-scores> is a tool to update the sitewide scores
		42	file with the newly generated scores. Since SpamAssassin has four
		43	different scoresets, which each need to be generated separately, this
		44	tool is used to only change the correct scoreset.
		45
		46	By default, the old scores are read from 50_scores.cf in the rules
		47	directory and the new ones from ./perceptron.scores. The output will
		48	be ./50_scores.cf by default.
		49
		50	The rules directory needs to be used to make sure scores are given for
		51	the right tests. Rules not found in the rules directory will not be
		52	given scores in the output.
		53
		54	=head1 BUGS
		55
		56	Please report bugs to http://bugzilla.spamassassin.org/
		57
		58	=head1 SEE ALSO
		59
		60	L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>
		61
		62	=cut
		63
		64	use FindBin;
		65	use lib "$FindBin::Bin/../lib";
		66	use Getopt::Long qw(:config bundling auto_help);
		67	use Mail::SpamAssassin::Masses;
		68	use Pod::Usage;
		69	use strict;
		70	use warnings;
		71
		72	use vars qw($opt_old $opt_new $opt_scoreset $opt_out $opt_c $opt_l);
		73
		74	GetOptions("old-scores=s" => \$opt_old,
		75	"new-scores=s" => \$opt_new,
		76	"s\|scoreset=i" => \$opt_scoreset,
		77	"output=s" => \$opt_out,
		78	"c\|cffile=s@" => \$opt_c,
		79	"l\|logfile=s" => \$opt_l);
		80
		81	$opt_l \|\|= "masses.log";
		82	$opt_scoreset = 0 unless defined $opt_scoreset;
83
19	my $NUM_SCORESETS = 4;	84	my $NUM_SCORESETS = 4;
20		85
21	my ($scoreset,$oldscores,$newscores) = @ARGV;	86	if (!$opt_c \|\| !scalar(@$opt_c)) {
		87	# Try to read this in from the log, if possible
		88	open IN, $opt_l or die "Can't open $opt_l: $!";
		89	my $files = 0; # are we in the files section?
		90	while(<IN>) {
		91	if (!$files) {
		92	if (/^\# SVN revision:/) {
		93	$opt_c = [ "$FindBin::Bin/../rules" ];
		94	last;
		95	} elsif (/^\# Using configuration:$/) {
		96	$files = 1;
		97	}
		98	} elsif (/^\#\s+(.)\s$/) {
		99	push (@$opt_c, $1);
		100	} else {
		101	# All done!
		102	last;
		103	}
		104	}
22		105
23	$scoreset = int($scoreset) if defined $scoreset;	106	if (!defined $opt_c) {
24	if (!defined $newscores \|\| $scoreset < 0 \|\| $scoreset >= $NUM_SCORESETS ) {	107	$opt_c = [ "$FindBin::Bin/../rules" ];
25	die "usage: rewrite-cf-with-new-scores scoreset oldscores.cf newscores.cf\n";	108	}
		109
		110	foreach my $file (@$opt_c) {
		111	die "Can't read $file" unless -r $file;
		112	}
26	}	113	}
27		114
28	system ("./parse-rules-for-masses -s $scoreset") and die;	115	if (!$opt_old) {
29	if (-e "tmp/rules.pl") {	116	$opt_old = $$opt_c[0] . "/50_scores.cf";
30	# Note, the spaces need to stay in front of the require to work around a RPM 4.1 problem
31	require "./tmp/rules.pl";
32	}	117	}
33	else {
34	die "parse-rules-for-masses had no error but no tmp/rules.pl!?!";
35	}
36		118
		119	$opt_new \|\|= "50_scores.cf";
		120
		121	my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
		122	scoreset => $opt_scoreset});
		123
		124	$masses->readrules();
		125	my $rules = $masses->get_rules_hash();
		126
37	# now read the generated scores	127	# now read the generated scores
38	my @gascoreorder = ();	128	my @gascoreorder = ();
		129	my %oldscores = ();
39	my %gascorelines = ();	130	my %gascorelines = ();
40	open (STDIN, "<$newscores") or die "cannot open $newscores";	131	open (STDIN, "<$opt_new") or die "cannot open $opt_new";
41	while (<STDIN>) {	132	while (<STDIN>) {
42	/^score\s+(\S+)\s+(-?\d+(?:\.\d+)?)/ or next;	133	/^score\s+(\S+)\s+(-?\d+(?:\.\d+)?)/ or next;
43	my $name = $1; my $score = $2;	134	my $name = $1; my $score = $2;
44	next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0);	135	next unless (exists ($rules->{$name}) && !$rules->{$name}->{issubrule});
45	next if ($name =~ /^__/);	136	next if ($name =~ /^__/);
46	next if ($name eq '(null)'); # er, oops ;)	137	next if ($name eq '(null)'); # er, oops ;)
47		138
Lines 49-55 Link Here
49	push (@gascoreorder, $name);	140	push (@gascoreorder, $name);
50	}	141	}
51		142
52	open (IN, "<$oldscores") or die "cannot open $oldscores";	143	open (IN, "<$opt_old") or die "cannot open $opt_old";
53	my $out = '';	144	my $out = '';
54	my $pre = '';	145	my $pre = '';
55		146
Lines 58-64 Link Here
58	while (<IN>) {	149	while (<IN>) {
59	if (/^\s*score\s+(\S+)\s/) {	150	if (/^\s*score\s+(\S+)\s/) {
60	delete $gascorelines{$1};	151	delete $gascorelines{$1};
61	next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0);	152	next unless (exists ($rules->{$1}) && $rules->{$1}->{issubrule} == 0);
62	}	153	}
63	$pre .= $_;	154	$pre .= $_;
64	/^# Start of generated scores/ and last;	155	/^# Start of generated scores/ and last;
Lines 82-91 Link Here
82	if (/^\s*score\s+\S+/) {	173	if (/^\s*score\s+\S+/) {
83	my($score,$name,@scores) = split;	174	my($score,$name,@scores) = split;
84		175
85	next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0);	176	next unless (exists ($rules->{$name}) && !$rules->{$name}->{issubrule});
86	if (defined $gascorelines{$name}) {	177	if (defined $gascorelines{$name}) {
87	# Set appropriate scoreset value	178	# Set appropriate scoreset value
88	$scores[$scoreset] = $gascorelines{$name};	179	$scores[$opt_scoreset] = $gascorelines{$name};
89		180
90	# Create new score line	181	# Create new score line
91	$_ = join(" ","score",$name,generate_scores(@scores))."\n";	182	$_ = join(" ","score",$name,generate_scores(@scores))."\n";
Lines 96-103 Link Here
96	}	187	}
97	close IN;	188	close IN;
98		189
		190	open OUT, ">$opt_out" or die "Can't open $opt_out: $!";
		191
99	# and output the lot	192	# and output the lot
100	print $pre, "\n";	193	print OUT $pre, "\n";
101	foreach my $name (@gascoreorder) {	194	foreach my $name (@gascoreorder) {
102	$_ = $gascorelines{$name};	195	$_ = $gascorelines{$name};
103	next unless (defined ($_));	196	next unless (defined ($_));
Lines 107-118 Link Here
107	@scores = @{$oldscores{$name}} if ( exists $oldscores{$name} );	200	@scores = @{$oldscores{$name}} if ( exists $oldscores{$name} );
108		201
109	# Set appropriate scoreset value	202	# Set appropriate scoreset value
110	$scores[$scoreset] = $_;	203	$scores[$opt_scoreset] = $_;
111		204
112	# Create new score line	205	# Create new score line
113	print join(" ","score",$name,generate_scores(@scores)),"\n";	206	print OUT join(" ","score",$name,generate_scores(@scores)),"\n";
114	}	207	}
115	print "\n", $out, "\n";	208	print OUT "\n", $out, "\n";
116		209
117	sub generate_scores {	210	sub generate_scores {
118	my (@scores) = @_;	211	my (@scores) = @_;

Lines 1-45 Link Here

(-)masses/mboxget (-45 lines)
1	#!/usr/bin/perl -w
2
3	# mboxget - get a message from a mailbox
4	#
5	# usage: mboxget [mass-check-mbox-id ...]
6	#
7	# <@LICENSE>
8	# Copyright 2004 Apache Software Foundation
9	#
10	# Licensed under the Apache License, Version 2.0 (the "License");
11	# you may not use this file except in compliance with the License.
12	# You may obtain a copy of the License at
13	#
14	# http://www.apache.org/licenses/LICENSE-2.0
15	#
16	# Unless required by applicable law or agreed to in writing, software
17	# distributed under the License is distributed on an "AS IS" BASIS,
18	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19	# See the License for the specific language governing permissions and
20	# limitations under the License.
21	# </@LICENSE>
22
23	use strict;
24
25	my $prog = $0;
26	$prog =~ s@.*/@@;
27
28	foreach my $where (@ARGV) {
29	my ($file, $offset) = ($where =~ m/(.*?)(?:\.(\d+))?$/);
30	open(INPUT, $file) \|\| die("$prog: open $file failed: $!\n");
31	if ($offset) {
32	seek(INPUT, $offset, 0) \|\| die("$prog: seek $offset failed: $!\n");
33	}
34	my $past = 0;
35	while (<INPUT>) {
36	if ($past) {
37	last if substr($_,0,5) eq "From ";
38	}
39	else {
40	$past = 1;
41	}
42	print $_;
43	}
44	close INPUT;
45	}

Lines 81-94 Link Here

(-)masses/rule-qa/corpus-nightly (-3 / +3 lines)
81	date > test.end	81	date > test.end
82		82
83	# results name	83	# results name
84	mv spam.log spam-$net$username.log	84	mv masses.log masses-$net$username.log
85	mv ham.log ham-$net$username.log
86		85
87	# rsync	86	# rsync
88	set +e	87	set +e
89	retry=0	88	retry=0
90	while true; do	89	while true; do
91	if rsync -CPcvuzb --timeout=120 spam-$net$username.log ham-$net$username.log $username@rsync.spamassassin.org::corpus/; then	90	if rsync -CPcvuzb --timeout=120 masses-$net$username.log $username@rsync.spamassassin.org::corpus/; then
92	break;	91	break;
93	fi	92	fi
94	if [ $retry -eq 120 ]; then	93	if [ $retry -eq 120 ]; then
Lines 99-101 Link Here
99	sleep 30	98	sleep 30
100	done	99	done
101	set -e	100	set -e
		101

Line 0 Link Here

(-)masses/README.user (+375 lines)
		1
		2	HOW TO GENERATE YOUR OWN SCORES FOR SPAMASSASSIN
		3	------------------------------------------------
		4
		5	Duncan Findlay
		6	<duncf@debian.org>
		7
		8
		9	1. Introduction
		10
		11	One of the reasons SpamAssassin is so accurate is that its scores are
		12	carefully optimized based on collections (aka. corpus, plural:
		13	corpora) of mail from volunteers all across the world. Each volunteer
		14	uses a script ("mass-check") to run SpamAssassin over each piece of
		15	mail in their corpus. They then submit the results to a central server
		16	where the SpamAssassin development team runs the scoring mechanism to
		17	generate optimal scores.
		18
		19	SpamAssassin uses four different scoresets depending on the options
		20	used. These are almost always referred to by number, as shown below:
		21
		22	Scoreset
		23	0 - Network tests disabled, Bayes disabled
		24	1 - Network tests enabled, Bayes disabled
		25	2 - Network tests disabled, Bayes enabled
		26	3 - Network tests enabled, Bayes enabled
		27
		28	Things are further complicated by the fact that when Bayes is enabled,
		29	it automatically learns using the equivalent scoreset with Bayes
		30	disabled. As a result, optimal scores for scoresets 2 and 3 can only
		31	be generated after scoresets 0 and 1. Set 0 logs can be generated from
		32	set 1 logs, but sets 2 and 3 need to be done separately.
		33
		34	As a result, volunteers who take part in our rescoring survey need to
		35	run 3 mass-checks, each of which can take many hours. Since the
		36	generation of scores is such a laborious process, the SpamAssassin
		37	developers only perform this once per release.
		38
		39	Luckily, the previous score optimizer, a Genetic Algorithm, which took
		40	almost 24 hours to optimize scores for one scoreset has been replaced
		41	with the Perceptron (thanks to Henry Stern) which uses a "Stochastic
		42	Gradient Descent" method. Don't worry if you don't understand what
		43	this means, I certainly don't. The Perceptron takes less than 15
		44	seconds to generate scores of roughly equal quality as the GA.
		45
		46
		47	2. Compiling a Corpus
		48
		49	The first step to generating your own scores is to start collecting
		50	mail, both ham (non-spam) and spam. These should be representative of
		51	all the mail you receive, but you should filter out spam related
		52	lists, like spamassassin-users to avoid skewing results. It is
		53	essential that these corpora be very well classified. It will greatly
		54	reduce the effectiveness of your scores if spam mails get misfiled
		55	into your ham folder and vice versa.
		56
		57	Also, it is important to note that SpamAssassin is not designed to be
		58	a virus filter, so it's best if you filter out viruses from your ham
		59	and spam folders too.
		60
		61	Furthermore, since spam and ham characteristics change over time, it's
		62	best to leave out mail over 6 months old. This is especially important for
		63	network tests, since these are designed to stop current spam, and are
		64	not historical records.
65
66	I'm not entirely sure how big corpora should be. The bigger, the
67	better. If your corpus is too small, it may not be sufficiently
68	representative of all the mail you receive, and accuracy will
69	suffer. My corpus of mail for the last 6 months is over 55000 messages
70	(35000 spam, 20000 ham).
71
72
73	3. Mass-check
74
75	Now that you've assembled your corpora, you need to use mass-check to
76	test each message with SpamAssassin. This script is surprisingly fast,
77	as it accesses the internal perl libraries of SpamAssassin, without
78	the need to load a new perl process each time (as you would if you
79	piped each message through spamassassin). Doing a scoreset 2 run (no
80	network, bayes enabled) I get roughly 10,000 messages an hour on an
81	unloaded Pentium 4, 2.80Ghz computer with 512 MB RAM.
82
83	By default, if you are not running out of an unpacked source tree,
84	mass-check will read rules from the usual locations. As a result, you
85	should make sure ~/.spamassassin/user_prefs contains no rules, unless
86	you are planning on using your generated scores for only yourself, not
87	sitewide.
88
89	The first step is to define the locations of all of the messages in
90	your corpora (these are known as "targets"). I find it's easiest to
91	put this in a separate file with lines of the following format:
92
93	class:format:location
94
95	Class is either "spam" or "ham", format is "mbox", "file", "dir" or
96	"mbx" and location is the path to the mailbox. mass-check supports
97	using * as a wildcard, so the following target is permitted:
98
99	spam:mbox:/home/duncf/Maildir/Old/spam/*
100
101	Once you have placed all the "targets" necessary for your corpora, run
102	mass-check with the following command.
103
104	mass-check -f file
105
106	If you are doing a mass-check run for scoreset 1 or 3 (i.e. network tests
107	enabled) you will also need to add the --net option, and you will want
108	to add -j8 (or some other number) to indicate how many messages to
109	test in parallel. This is useful since a lot of time would otherwise
110	be spent waiting for network queries to return.
111
112	mass-check will generate a log file in the current directory entitled
113	masses.log. This is the log file that will enable us to optimize
114	scores.
115
116	For the impatient: if you're one of those people who want to know
117	exactly how far mass-check has gotten through your mail, use the
118	--showdots option.
119
120
121	4. Checking the quality of your corpora (a.k.a. Pulling Weeds)
122
123	In order to ensure that your corpora don't contain misfiled mails, it
124	is good to double check the highest scoring hams and lowest scoring
125	spams.
126
127	First check ham mail:
128
129	grep "^h" masses.log \| sort -rn -k2,2 \| head -20
130
131	If you want to read the corresponding messages try piping to
132	extract-message-from-mbox -m (see the extract-message-from-mbox
133	section for more detail).
134
135	Do the same with spam mail:
136
137	grep "^s" masses.log \| sort -n -k2,2 \| head -20
138
139
140	5. extract-message-from-mbox
141
142	extract-message-from-mbox takes a mbox filename and a byte offset and
143	outputs the corresponding mail message. With the -m option, mass-check
144	output (i.e. lines from masses.log) is read from the standard
145	input. Without, arguments are expected to be in the form
146	<mbox>.<offset> (i.e. /path/to/mbox.12345)
147
148	The -h option can also be used to only show message headers.
149
150	As shown above, it is quite useful to pipe portions of masses.log to
151	extract-message-from-mbox.
152
153
154	6. hit-frequencies
155
156	hit-frequencies doesn't really help you advance toward your goal of
157	optimizing scores, but it is very useful in evaluating locally created
158	rules. Run it, look at its output; you'll find it interesting (and if
159	not, feel free to skip to the next section).
160
161	hit-frequencies -x -p -s <scoreset>
162
163	hit-frequencies (and many other scripts) are set to automatically
164	guess where to find your configuration files based on
165	masses.log. Unfortunately, it isn't perfect (actually it's a rather
166	crude hack, but that's irrelevant). You may have to check masses.log
167	to figure out where it's searching and/or add --cffile options (you
168	can specify multiple paths using multiple --cffile options).
169
170	hit-frequencies -x -p generates the following output:
171
172	OVERALL% SPAM% HAM% S/O RANK SCORE NAME
173	64008 40932 23076 0.639 0.00 0.00 (all messages)
174	100.000 63.9483 36.0517 0.639 0.00 0.00 (all messages as %)
175	10.382 16.2342 0.0000 1.000 1.00 3.10 FORGED_MUA_OUTLOOK
176	8.266 12.9263 0.0000 1.000 0.99 1.00 FORGED_OUTLOOK_TAGS
177	6.484 10.1388 0.0000 1.000 0.98 4.50 DRUGS_ERECTILE_OBFU
178	[...]
179
180	The first two rows show the size of the corpora and their ham/spam
181	break down. The following lines list each rule found and give various
182	statistics about it based on your masses.log.
183
184	OVERALL% represents the percentage of total messages (spam and ham)
185	that the rule hits, SPAM% and HAM% show the percentages on each
186	corpus. S/O is SPAM% divided by (SPAM% + HAM%). Generally good
187	(non-nice) rules have S/O's over 0.95, while nice (negative-scoring)
188	rules generally have S/O's less than 0.5. RANK is a human readable
189	indicator of how good a rule is. The higher the better, always. RANK
190	is designed to be a rough indicator of the score the perceptron is
191	likely to give it. SCORE is simply the current score. (This is simply
192	listed for convenience, not calculated in any way.)
193
194	If you do any rule development locally, you will find this is a great
195	tool. If you come up with some great rules (that we haven't already
196	thought of), please send us a patch at
197	http://bugzilla.spamassassin.org/.
198
199
200	7. lint-rules-from-freqs
201
202	This script is designed to read in your masses.log and the
203	SpamAssassin configuration files in order to find both bad syntax and
204	bad rules that hit few messages or (with -f) have too many false
205	positives/negatives, etc.
206
207	lint-rules-from-freqs -f -s <scoreset>
208
209	As with hit-frequencies, it tries to be smart with choosing the right
210	--cffile options.
211
212	This script is roughly the equivalent of running a spamassassin --lint
213	and running a hit-frequencies to determine which tests have bad S/O
214	ratios.
215
216
217	8. logs-to-c
218
219	logs-to-c is the program that converts a mass-check log into code that
220	can be easily used by the perceptron. Currently, it is necessary to
221	use the output of logs-to-c to even compile perceptron, but that
222	should hopefully change in the near future.
223
224	The files logs-to-c creates need to be in the tmp/ sub-directory of the
225	directory where perceptron.c is.
226
227	logs-to-c -o tmp/ -s <scoreset>
228
229	These files contain information about each rule such as whether or not
230	the perceptron is permitted to change the rule's score, the range
231	within which the perceptron can adjust it, whether or not a rule is
232	nice, etc. In addition, these files contain information about each
233	mail hit and which tests were hit. The files generated by logs-to-c
234	are not really easy to read, so don't try; use hit-frequencies
235	instead.
236
237
238	9. perceptron
239
240	perceptron is the brains behind the whole process. (And we must of
241	course thank the brain behind perceptron, Henry Stern, for his
242	contribution.)
243
244	While the perceptron takes options for things such as "ham
245	preference", "number of epochs", "learning rate" and "weight decay",
246	it's probably best to trust the defaults; unless of course you want to
247	try to find the optimum parameters (and post them to
248	http://bugzilla.spamassassin.org/ with your evidence).
249
250	The perceptron is incredibly quick. So start it, wait 15 seconds and
251	voila, your optimized scores are ready. The output is in
252	perceptron.scores.
253
254	Unfortunately, it needs to be built from source every time you want to
255	use it with a different masses.log or set of rules. In the directory
256	containing perceptron.c, try:
257
258	make perceptron
259	./perceptron
260
261	If you don't have the Makefile, try
262	gcc -g -O2 -Wall -o perceptron perceptron.c -lm
263	./perceptron
264
265
266	10. rewrite-cf-with-new-scores
267
268	perceptron dumps its results in perceptron.scores. Great. How does
269	that help you? rewrite-cf-with-new-scores takes care of changing the
270	old configuration files to correspond with the new scores. The script
271	takes into account rules found in your configuration, so make sure
272	that the --cffile argument is right (it'll read this from masses.log
273	by default). The syntax is:
274
275	rewrite-cf-with-new-scores --old 50_scores.cf --new perceptron.scores \
276	--out 50_scores.new.cf -l masses.log -s 2
277
278	Make sure you don't forget the -s option. You need to tell it which
279	scoreset to update or it'll update set 0, which is not what you want
280	(unless you just did a set 0 run, of course).
281
282	Note: the statistics in the new scores file are NOT updated. Just the
283	scores are.
284
285	11. fp-fn-statistics
286
287	This script calculates how good the scores are at a given threshold. It
288	returns the number of false positives, false negatives, true
289	positives, true negatives and a whole variety of fun statistics.
290
291	./fp-fn-statistics -s <scoreset> --cffile <path>
292
293	fp-fn-statistics also generates a TCR which is essentially an overall
294	rating of how good the scores are. (This is only accurate when run on
295	a different corpus of mail than that with which the scores were
296	generated). TCR stands for "Total Cost Ratio". The higher the number,
297	the better the set of scores.
298
299
300	12. Submitting corpora for SpamAssassin
301
302	If you want to contribute your mass-check logs to the SpamAssassin
303	rescoring process, please download the latest revision of SpamAssassin
304	from the subversion repository. See this page of the wiki:
305	http://wiki.spamassassin.org/DownloadFromSvn
306
307	You will want to read CORPUS_POLICY and CORPUS_SUBMIT. We only do
308	large rescoring runs just before releases, so be sure to follow the
309	lists which will have more information and reminders on how to
310	participate.
311
312	Please be sure your corpora are of high quality (everything must be
313	carefully checked to avoid misfilings). Also, we appreciate varied
314	sources of mail.
315
316
317	13. Other scripts
318
319	Only a subset of the scripts used in rule development and scoring have
320	been documented here. Most of the others aren't really very
321	useful. You can examine the others by downloading the source from the
322	subversion repository: http://wiki.spamassassin.org/DownloadFromSvn.
323	Everything relating to rule QA and development is in the masses/
324	sub-directory.
325
326	The scripts presented here have had man pages written for them, and an
327	attempt has been made to standardize the options for ease of use. Many
328	of the others may require some reading of source to understand how
329	they work and what they do.
330
331
332	14. Frequently Asked Questions
333
334	(Since this is the first version of this document, I'm guessing what
335	questions would otherwise be asked. So this isn't really a "Frequently
336	Asked Questions" list, but a "What did Duncan fail to address
337	elsewhere?" list.)
338
339	Q. Why don't the scripts automatically guess which scoreset to use like
340	they do with --cffile?
341
342	A. Firstly, mass-check does not know what scoreset
343	you are running. It could guess, but it probably shouldn't. Secondly,
344	the same masses.log can be used for multiple scoresets (a set 1 log
345	can be used to generate scores for sets 0 and 1, by stripping out net
346	rules etc.)
347
348	Q. How can I determine how good the scoring system is?
349
350	A. There is a series of scripts in the source directory (in
351	masses/tenpass/) designed to determine how accurate the perceptron is
352	by using "10-fold Cross Validation" (10fcv). Basically, the masses.log
353	is split into 10 "buckets" and each bucket is sequentially used to
354	validate against scores generated from the remaining 9.
355
356
357	15. Bugs, author, improvements, etc.
358
359	SpamAssassin is written and maintained by a group of developers, whose
360	names can be found in the CREDITS file.
361
362	If you have further questions about SpamAssassin or the rescoring
363	scripts, try the following:
364
365	- Ask on one of the SpamAssassin mailing lists:
366
367	http://www.spamassassin.org/lists.html
368
369	- If you've found a bug, file a report:
370
371	http://bugzilla.spamassassin.org/
372
373	- Also, check out our wiki:
374
375	http://wiki.spamassassin.org/

Lines 1-47 Link Here

(-)masses/runGA (-24 / +21 lines)
1	#!/bin/sh	1	#!/bin/sh
2		2
3	SCORESET="0"	3	SCORESET="0"
		4	if [ "x$1" != "x" ] ; then
		5	SCORESET=$1
		6	fi
		7
4	NAME="set$SCORESET"	8	NAME="set$SCORESET"
		9	BASE="logs"
5		10
6	if [ ! -f "ORIG/ham-$NAME.log" -o ! -f "ORIG/spam-$NAME.log" ]; then	11	if [ ! -f "ORIG/masses-$NAME.log" ]; then
7	echo "Couldn't find logs for $NAME" >&2	12	echo "Couldn't find logs for $NAME" >&2
8	exit 1	13	exit 1
9	fi	14	fi
10		15
11	if [ "x$1" = "x" ]; then	16	if [ "x$2" = "x" ]; then
12	echo "[Doing a scoreset $SCORESET score-generation run]"	17	echo "[Doing a scoreset $SCORESET score-generation run]"
13		18
14	# Clean out old runs	19	# Clean out old runs
15	echo "[Cleaning up]"	20	echo "[Cleaning up]"
16	rm -rf spam-validate.log nonspam-validate.log ham-validate.log spam.log nonspam.log ham.log NSBASE SPBASE tmp make.output freqs perceptron.scores \	21
17	gen-$NAME.out gen-$NAME.scores gen-$NAME.validate	22	rm -rf masses-validate.log masses.log $BASE tmp make.output freqs \
		23	perceptron.scores gen-$NAME.out gen-$NAME.scores gen-$NAME.validate
18	make clean >/dev/null	24	make clean >/dev/null
19		25
20	# Generate 90/10 split logs	26	# Generate 90/10 split logs
21	echo "[Generating 90/10 split ham]"	27	echo "[Generating 90/10 split ham]"
22	mkdir NSBASE SPBASE	28	mkdir $BASE
23	cd NSBASE	29	cd $BASE
24	../tenpass/split-log-into-buckets 10 < ../ORIG/ham-$NAME.log > /dev/null	30	../tenpass/split-log-into-buckets 10 < ../ORIG/masses-$NAME.log > /dev/null
25	cat split-[1-9].log > nonspam.log	31	cat split-[1-9].log > masses.log
26	rm -f split-[1-9].log	32	rm -f split-[1-9].log
27	mv split-10.log nonspam-validate.log	33	mv split-10.log masses-validate.log
28		34
29	echo "[Generating 90/10 split spam]"
30	cd ../SPBASE
31	../tenpass/split-log-into-buckets 10 < ../ORIG/spam-$NAME.log > /dev/null
32	cat split-[1-9].log > spam.log
33	rm -f split-[1-9].log
34	mv split-10.log spam-validate.log
35	cd ..	35	cd ..
36		36
37	echo "[Setting up for gen run]"	37	echo "[Setting up for gen run]"
38	# Ok, setup for a run	38	# Ok, setup for a run
39	ln -s SPBASE/spam.log .	39	ln -s $BASE/masses.log .
40	ln -s NSBASE/nonspam.log .	40	ln -s $BASE/masses-validate.log .
41	ln -s NSBASE/nonspam.log ham.log
42	ln -s SPBASE/spam-validate.log .
43	ln -s NSBASE/nonspam-validate.log .
44	ln -s NSBASE/nonspam-validate.log ham-validate.log
45		41
46	# try to find number of processors	42	# try to find number of processors
47	numcpus=`cpucount 2>/dev/null \|\| egrep -c '^processor\b' /proc/cpuinfo 2>/dev/null \|\| echo 1`	43	numcpus=`cpucount 2>/dev/null \|\| egrep -c '^processor\b' /proc/cpuinfo 2>/dev/null \|\| echo 1`
Lines 57-67 Link Here
57		53
58	else	54	else
59		55
		56	echo "Make sure 50_scores.cf has been replaced appropriately"
		57
60	# This needs to have 50_scores.cf in place first ...	58	# This needs to have 50_scores.cf in place first ...
61	echo "[gen validation results]"	59	echo "[gen validation results]"
62	./logs-to-c --spam=SPBASE/spam-validate.log \	60	./fp-fn-statistics --logfile=BASE/masses-validate.log \
63	--nonspam=NSBASE/nonspam-validate.log \	61	--cffile=../rules --scoreset=$SCORESET \| tee gen-$NAME.validate
64	--count --cffile=../rules --scoreset=$SCORESET \| tee gen-$NAME.validate
65		62
66	echo "[STATISTICS file generation]"	63	echo "[STATISTICS file generation]"
67	./mk-baseline-results $SCORESET \| tee gen-$NAME.statistics	64	./mk-baseline-results $SCORESET \| tee gen-$NAME.statistics

Lines 16-139 Link Here

(-)masses/lint-rules-from-freqs (-248 / +159 lines)
16	# limitations under the License.	16	# limitations under the License.
17	# </@LICENSE>	17	# </@LICENSE>
18		18
		19	=head1 NAME
		20
		21	lint-rules-from-freqs - Try to find problems with SpamAssassin rules
		22
		23	=head1 SYNOPSIS
		24
		25	lint-rules-from-freqs [options]
		26
		27	Options:
		28	-c,--cffile=path Use path as the rules directory
		29	-s,--scoreset=n Use scoreset n
		30	-l,--logfile=file Read in file instead of masses.log
		31	-f Also take into account false positives/negatives
		32
		33	=head1 DESCRIPTION
		34
		35	This script analyzes SpamAssassin tests, based on the hit frequencies
		36	and S/O ratios from a mass-check log (masses.log). This script can
		37	also optionally take into account the false positive/negative
		38	frequencies.
		39
		40	The script first uses the SpamAssassin rules parser to report on any
		41	illegal syntax. Then it checks the rules match frequencies from the
		42	mass-check log in order to determine how effective the rule is.
		43
		44	=head1 BUGS
		45
		46	Please report bugs to http://bugzilla.spamassassin.org/
		47
		48	=head1 SEE ALSO
		49
		50	L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>
		51
		52	=cut
		53
		54
		55	use FindBin;
		56	use lib "$FindBin::Bin/../lib";
		57	use Mail::SpamAssassin::Masses;
		58	use Mail::SpamAssassin;
		59	use Getopt::Long qw(:config bundling auto_help);
		60	use strict;
		61	use warnings;
		62
19	# any tests that get less than this % of matches on both spam or nonspam, are	63	# any tests that get less than this % of matches on both spam or nonspam, are
20	# reported.	64	# reported.
21	my $LOW_MATCHES_PERCENT = 0.03;	65	my $LOW_MATCHES_PERCENT = 0.03;
22	my $scoreset = 0;
23		66
24	sub usage {	67	use vars qw($opt_c $opt_l $opt_s $opt_f $opt_p);
25	die "
26	lint-rules-from-freqs: perform 'lint' testing on SpamAssassin rules and scores
27		68
28	usage: ./lint-rules-from-freqs [-f falsefreqs] < freqs > badtests	69	GetOptions("c\|cffile=s@" => \$opt_c,
		70	"s\|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
		71	"l\|logfile=s" => \$opt_l,
		72	"f\|falses" => \$opt_f);
29		73
30	This analyzes SpamAssassin tests, based on the hit frequencies and S/O ratios
31	from a mass-check logfile pair.
32		74
33	The 'freqs' argument is the frequency of hits in all messages ('hit-frequencies	75	$opt_s = 0 unless defined $opt_s;
34	-x -p' output).	76	$opt_l \|\|= "masses.log";
35		77
36	The 'falsefreqs' argument is frequencies of hits in false-positives and	78	if (!$opt_c \|\| !scalar(@$opt_c)) {
37	false-negatives only ('hit-frequencies -x -p -f' output).	79	# Try to read this in from the log, if possible
		80	open IN, $opt_l or die "Can't open $opt_l: $!";
		81	my $files = 0; # are we in the files section?
		82	while(<IN>) {
		83	if (!$files) {
		84	if (/^\# SVN revision:/) {
		85	$opt_c = [ "$FindBin::Bin/../rules" ];
		86	last;
		87	} elsif (/^\# Using configuration:$/) {
		88	$files = 1;
		89	}
		90	} elsif (/^\#\s+(.)\s$/) {
		91	push (@$opt_c, $1);
		92	} else {
		93	# All done!
		94	last;
		95	}
		96	}
38		97
39	";	98	if (!defined $opt_c) {
40	}	99	$opt_c = [ "$FindBin::Bin/../rules" ];
		100	}
41		101
42	my $opt_falsefreqs;	102	foreach my $file (@$opt_c) {
43	while ($#ARGV >= 0) {	103	die "Can't read $file" unless -r $file;
44	$_ = shift @ARGV;	104	}
45	if (/^-f/) { $_ = shift @ARGV; $opt_falsefreqs = $_; }
46	elsif (/^-s/) { $_ = shift @ARGV; $scoreset = $_; }
47	else { usage(); }
48	}	105	}
49		106
50	print "BAD TESTS REPORT\n";	107	print "BAD TESTS REPORT\n";
51	readrules();	108	# First, do a --lint
52	print "\n" .((scalar keys %rulefile) + 1). " rules found.\n";	109
53	print "\nRule file syntax issues:\n\n";	110	print "\nRule file syntax issues:\n\n";
54	lintrules();
55		111
56	if ($opt_falsefreqs) {	112	{
57	open (FALSE, "<$opt_falsefreqs");	113	local (STDERR) = \STDOUT; # Get lint errors on STDOUT
58	while (<FALSE>) {	114
59	if (!/^\s*([\d\.]+)/) {	115	# Read the config ourselves...
60	my ($overall, $spam, $nons, $so, $score, $name) = split (' ');	116
61	next unless ($name =~ /\S/);	117	# Read init.pre from each directory, then glob for the rest.
62	$falsefreqs_spam{$name} = $spam;	118
63	$falsefreqs_nons{$name} = $nons;	119	my $cf_txt = '';
64	$falsefreqs_so{$name} = $so;	120	my @files;
		121	my @dirs;
		122	foreach my $file (@$opt_c) {
		123	if (-d $file) {
		124	if (-r "$file/init.pre") {
		125	push @files, "$file/init.pre";
		126	}
		127	push @dirs, $file;
65	}	128	}
		129	else {
		130	push @files, $file;
		131	}
66	}	132	}
67	close FALSE;	133	foreach my $dir (@dirs) {
68	}	134	my @cfs = glob("$dir/*.cf");
		135	push @files, grep { -r $_ } @cfs;
		136	}
69		137
70	while (<>) {	138	foreach my $file (@files) {
71	if (!/^\s*([\d\.]+)/) {	139	if (-r $file) {
72	$output{'a_header'} = $_; next;	140	open IN, $file;
		141	$cf_txt .= "file start $file\n";
		142	$cf_txt .= join('', <IN>);
		143	$cf_txt .= "\nfile end $file\n";
		144	close IN;
		145	}
73	}	146	}
74		147
		148	my $spamtest = new Mail::SpamAssassin({config_text => $cf_txt});
		149
		150	$spamtest->lint_rules();
		151	}
		152
		153
		154	# Next, check for other stuff
		155	my $masses = Mail::SpamAssassin::Masses->new({rulesdir => $opt_c,
		156	scoreset => $opt_s, #,,
		157	falses => $opt_f,
		158	logfile => $opt_l});
		159
		160	$masses->readlogs();
		161	$masses->do_statistics();
		162
		163	my $rules = $masses->get_rules_array();
		164
		165
		166	my %output;
		167
		168	foreach my $rule (@$rules) {
		169
75	my $badrule;	170	my $badrule;
76	my ($overall, $spam, $nons, $so, $score, $name) = split (' ');
77	next unless ($name =~ /\S/);
78		171
79	my $ffspam = $falsefreqs_spam{$name};	172	next if ($rule->{tflags} =~ /\bnet\b/ && ($opt_s % 2) == 0);
80	my $ffnons = $falsefreqs_nons{$name};	173	next if ($rule->{tflags} =~ /\buserconf\b/);
81	my $ffso = $falsefreqs_so{$name};
82		174
83	my $tf = $tflags{$name};	175	if ($rule->{freq_spam} == 0 && $rule->{freq_ham} == 0) { # sanity!
84	next if ($tf =~ /net/ && ($scoreset % 2) == 0);
85	next if ($tf =~ /userconf/);
86		176
87	if ($overall == 0.0 && $spam == 0.0 && $nons == 0.0) { # sanity!
88	$badrule = 'no matches';	177	$badrule = 'no matches';
89		178
90	} else {	179	} else {
91	if ($score < 0.0) {	180	if ($rule->{score} < 0.0) {
92	# negative score with more spams than nonspams? bad rule.	181	# negative score with more spams than nonspams? bad rule.
93	if ($tf !~ /nice/ && $so > 0.5 && $score < 0.5) {	182	if (!$rule->{isnice} && $rule->{soratio} > 0.5 && $rule->{score} < 0.5) {
94	$badrule = 'non-nice but -ve score';	183	$badrule = 'non-nice but -ve score';
95	}	184	}
96		185	if ($rule->{isnice} && $rule->{soratio} > 0.5 && $rule->{score} < 0.5) {
97	if ($tf =~ /nice/ && $so > 0.5 && $score < 0.5) {	186	if ($opt_f && $rule->{freq_fn} < $rule->{freq_fp}) {
98	if ($ffso < 0.5) {
99	$badrule = 'fn';	187	$badrule = 'fn';
100	} else {
101	# ignore, the FNs are overridden by other tests so it doesn't
102	# affect the overall results.
103	}	188	}
		189	# else {
		190	# ignore, the FNs are overridden by other tests so it doesn't
		191	# affect the overall results.
		192	# }
104	}	193	}
105		194
106	# low number of matches overall	195	# low number of matches overall
107	if ($nons < $LOW_MATCHES_PERCENT)	196	if ($rule->{ham_percent} < $LOW_MATCHES_PERCENT)
108	{ $badrule \|\|= ''; $badrule .= ', low matches'; }	197	{ $badrule \|\|= ''; $badrule .= ', low matches'; }
109		198
110	} elsif ($score > 0.0) {	199	} elsif ($rule->{score} > 0.0) {
111	# positive score with more nonspams than spams? bad.	200	# positive score with more nonspams than spams? bad.
112	if ($tf =~ /nice/ && $so < 0.5 && $score > 0.5) {	201	if ($rule->{isnice} && $rule->{soratio} < 0.5 && $rule->{score} > 0.5) {
113	$badrule = 'nice but +ve score';	202	$badrule = 'nice but +ve score';
114	}	203	}
115		204
116	if ($tf !~ /nice/ && $so < 0.5 && $score > 0.5) {	205	if (!$rule->{isnice} && $rule->{soratio} < 0.5 && $rule->{score} > 0.5) {
117	if ($ffso > 0.5) {	206	if ($opt_f && $rule->{freq_fp} > $rule->{freq_fn}) {
118	$badrule = 'fp';	207	$badrule = 'fp';
119	} else {
120	# ignore, the FPs are overridden by other tests so it doesn't
121	# affect the overall results.
122	}	208	}
		209	# else {
		210	# ignore, the FPs are overridden by other tests so it doesn't
		211	# affect the overall results.
		212	# }
123	}	213	}
124		214
125	# low number of matches overall	215	# low number of matches overall
126	if ($spam < $LOW_MATCHES_PERCENT)	216	if ($rule->{spam_percent} < $LOW_MATCHES_PERCENT)
127	{ $badrule \|\|= ''; $badrule .= ', low matches'; }	217	{ $badrule \|\|= ''; $badrule .= ', low matches'; }
128		218
129	} elsif ($score == 0.0) {	219	} elsif ($rule->{score} == 0.0) {
130	$badrule = 'score is 0';	220	$badrule = 'score is 0';
131	}	221	}
132	}	222	}
133		223
134	if (defined $badrule) {	224	if (defined $badrule) {
135	$badrule =~ s/^, //; chomp;	225	$badrule =~ s/^, //;
136	$output{$badrule} .= $_ . " ($badrule)\n";	226	$output{$badrule} .= $rule->{name} . " ($badrule)\n";
137	}	227	}
138	}	228	}
139		229
Lines 156-337 Link Here
156	exit;	246	exit;
157		247
158		248
159	sub concat_rule_lang {
160	my $rule = shift;
161	my $lang = shift;
162
163	if (defined $lang && $lang ne '') {
164	return "[$lang]_$rule";
165	} else {
166	return $rule;
167	}
168	}
169
170	# note: do not use parse-rules-for-masses here, we need to do linting instead
171	# of your average parse
172	sub readrules {
173	my @files = <../rules/[0-9]*.cf>;
174	my $file;
175	%rulesfound = ();
176	%langs = ();
177	foreach $file (@files) {
178	open (IN, "<$file");
179	while (<IN>) {
180	s/#.*$//g; s/^\s+//; s/\s+$//; next if /^$/;
181
182	# make all the foo-bar stuff foo_bar
183	1 while s/^(\S+)-/\1_/g;
184	1 while s/^(lang\s+\S+\s+\S+)-/\1_/g;
185
186	my $lang = '';
187	if (s/^lang\s+(\S+)\s+//) {
188	$lang = $1; $langs{$1} = 1;
189	}
190
191	if (/^(header\|rawbody\|body\|full\|uri\|meta)\s+(\S+)\s+/) {
192	$rulesfound{$2} = 1;
193	$rulefile{$2} \|\|= $file;
194	$scorefile{$1} = $file;
195	$score{$2} \|\|= 1.0;
196	$tflags{$2} \|\|= '';
197	$descfile{$2} \|\|= $file; # a rule with no score or desc is OK
198	$description{$2}->{$lang} = undef;
199
200	if (/^body\s+\S+\s+eval:/) {
201	# ignored
202	} elsif (/^body\s+\S+\s+(.*)$/) {
203	my $re = $1;
204
205	# If there's a ( in a rule where it should be (?:, flag it.
206	# but ignore [abc(] ...
207	if ($re =~ /[^\\]\([^\?]/ && $re !~ /\[[^\]]*[^\\]\(/) {
208	print "warning: non-(?:...) capture in regexp in $file: $_\n";
209	}
210	if ($re =~ /\.[\*\+]/) {
211	print "warning: .* in regexp in $file: $_\n";
212	}
213	if ($re =~ /[^\\]\{(\d),?(\d?)\}/) {
214	if ($1 > 120 \|\| $2 > 120) {
215	print "warning: long .{n} in regexp in $file: $_\n";
216	}
217	}
218	}
219
220	} elsif (/^describe\s+(\S+)\s+(.?)\s$/) {
221	$rulesfound{$1} = 1;
222	$descfile{concat_rule_lang ($1, $lang)} \|\|= $file;
223	$descfile{$1} \|\|= $file;
224	$description{$1}->{$lang} = $2;
225	} elsif (/^tflags\s+(\S+)\s+(.+)$/) {
226	$rulesfound{$1} = 1;
227	$tflags{$1} = $2;
228	$tflagsfile{concat_rule_lang ($1, $lang)} = $file;
229	$tflagsfile{$1} = $file;
230	} elsif (/^score\s+(\S+)\s+(.+)$/) {
231	$rulesfound{$1} = 1;
232	$scorefile{concat_rule_lang ($1, $lang)} = $file;
233	$scorefile{$1} = $file;
234	$score{$1} = $2;
235	} elsif (/^(clear_report_template\|clear_spamtrap_template\|report\|spamtrap\|
236	clear_terse_report_template\|terse_report\|
237	required_score\|ok_locales\|ok_languages\|test\|lang\|
238	spamphrase\|whitelist_from\|require_version\|
239	clear_unsafe_report_template\|unsafe_report\|
240	(?:bayes_)?auto_learn_threshold_nonspam\|(?:bayes_)?auto_learn_threshold_spam\|
241	(?:bayes_)?auto_learn
242	)/x) {
243	next;
244	} else {
245	print "warning: unknown rule in $file: $_\n";
246	}
247	}
248	close IN;
249	}
250	@langsfound = sort keys %langs;
251	@rulesfound = sort keys %rulesfound;
252	}
253
254	sub lintrules {
255	my %possible_renames = ();
256
257	foreach my $rule (@rulesfound) {
258	my $match = $rule;
259	$match =~ s/_\d+[^_]+$//gs; # trim e.g. "_20K"
260	$match =~ s/[^A-Z]+//gs; # trim numbers etc.
261
262	if (defined ($rulefile{$rule}) && $possible_renames{$match} !~ / \Q$rule\E\b/) {
263	$possible_renames{$match} .= " ".$rule;
264	}
265	$possible_rename_matches{$rule} = $match;
266	}
267
268	foreach my $lang ('', @langsfound) {
269	foreach my $baserule (@rulesfound) {
270	next if ( $baserule =~ /^__/ \|\| $baserule =~ /^T_/ );
271
272	my $rule = concat_rule_lang ($baserule, $lang);
273	my $f = $descfile{$rule};
274	my $warned = '';
275
276	if (defined $f && !defined ($rulefile{$rule})
277	&& !defined ($rulefile{$baserule}))
278	{
279	print "warning: $baserule has description, but no rule: $f\n";
280	$warned .= ' lamedesc';
281	}
282
283	# Check our convention for rule length
284	if ( (($lang ne '' && defined($rulefile{$rule})) \|\| ($lang eq '' && defined ($rulefile{$baserule}))) && length $baserule > 22 ) {
285	print "warning: $baserule has a name longer than 22 chars: $f\n";
286	}
287	# Check our convention for rule length
288	if ( (($lang ne '' && defined($rulefile{$rule})) \|\| ($lang eq '' && defined ($rulefile{$baserule}))) && defined $description{$baserule}->{$lang} && length $description{$baserule}->{$lang} > 50 ) {
289	print "warning: $baserule has a description longer than 50 chars: $f\n";
290	}
291
292	# lang rule trumps normal rule
293	$f = $rulefile{$rule} \|\| $rulefile{$baserule};
294	# if the rule exists, and the language/rule description doesn't exist ...
295	if ( defined $f && !defined $description{$baserule}->{$lang} )
296	{
297	print "warning: $baserule exists, ",( $lang ne '' ? "lang $lang, " : "" ),"but has no description: $f\n";
298	$warned .= ' lamedesc';
299	}
300
301
302	$f = $scorefile{$rule};
303	if (defined $f && !defined ($rulefile{$rule})
304	&& !defined ($rulefile{$baserule}))
305	{
306	print "warning: $baserule has score, but no rule: $f\n";
307	$warned .= ' lamescore';
308	}
309
310	my $r = $possible_rename_matches{$rule};
311	if ($warned ne '' && defined $r) {
312	my @matches = split (' ', $possible_renames{$r});
313	if (scalar @matches != 0) {
314	my $text = '';
315
316	# now try and figure out "nearby" rules with no description/score
317	foreach my $baser (@matches) {
318	my $blang;
319	if ($descfile{$rule} =~ /text_(\S\S)\./) {
320	$blang = $1;
321	}
322	my $r = concat_rule_lang ($baser, $blang);
323	#warn "$r $descfile{$r} $descfile{$baser}";
324	next if ($warned =~ /lamedesc/ && (defined $descfile{$r}));
325	next if ($warned =~ /lamescore/ && (defined $scorefile{$r}));
326	$text .= " $baser";
327	}
328
329	if ($text ne '') {
330	print "warning: (possible renamed rule? $text)\n";
331	}
332	}
333	}
334	}
335	}
336	}
337

Lines 3-36 Link Here

(-)masses/Makefile (-16 / +9 lines)
3	LDFLAGS= -lm	3	LDFLAGS= -lm
4		4
5	# What rule scoreset are we using?	5	# What rule scoreset are we using?
6	SCORESET = 0	6	SCORESET = 3
		7	LOGFILE = masses.log
7		8
8	#### Should be no need to modify below this line	9	#### Should be no need to modify below this line
9		10
10	all: badrules perceptron	11	all: badrules perceptron
11		12
12	perceptron: perceptron.o	13	perceptron: perceptron.o
13	$(CC) -o perceptron perceptron.o $(LDFLAGS)	14	$(CC) -o perceptron perceptron.o $(LDFLAGS)
14		15
15	perceptron.o: tmp/rules.pl tmp/tests.h tmp/scores.h	16	perceptron.o: tmp/tests.h
16	$(CC) $(CFLAGS) -c -o perceptron.o perceptron.c	17	$(CC) $(CFLAGS) -c -o perceptron.o perceptron.c
17		18
18	tmp/rules.pl: tmp/.created parse-rules-for-masses	19	tmp/tests.h: tmp/.created logs-to-c
19	perl parse-rules-for-masses -d ../rules -s $(SCORESET)	20	perl logs-to-c --scoreset=$(SCORESET) --logfile=$(LOGFILE)
20		21
21	tmp/tests.h: tmp/.created tmp/ranges.data logs-to-c	22	freqs: masses.log
22	perl logs-to-c --scoreset=$(SCORESET)	23	perl hit-frequencies -x -p -s $(SCORESET) --logfile=$(LOGFILE) > freqs
23		24
24	tmp/scores.h: tmp/tests.h
25
26	tmp/ranges.data: tmp/.created freqs score-ranges-from-freqs
27	perl score-ranges-from-freqs ../rules $(SCORESET) < freqs
28
29	freqs: spam.log ham.log
30	perl hit-frequencies -x -p -s $(SCORESET) > freqs
31
32	badrules: freqs	25	badrules: freqs
33	perl lint-rules-from-freqs < freqs > badrules	26	perl lint-rules-from-freqs -s $(SCORESET) --logfile=$(LOGFILE) > badrules
34		27
35	tmp/.created:	28	tmp/.created:
36	-mkdir tmp	29	-mkdir tmp

Lines 16-159 Link Here

(-)masses/mass-check (-110 / +237 lines)
16	# limitations under the License.	16	# limitations under the License.
17	# </@LICENSE>	17	# </@LICENSE>
18		18
19	sub usage {	19	=head1 NAME
20	die <<ENDOFUSAGE;
21	usage: mass-check [options] target ...
22
23	-c=file set configuration/rules directory
24	-p=dir set user-prefs directory
25	-f=file read list of targets from <file>
26	-j=jobs specify the number of processes to run simultaneously
27	--net turn on network checks!
28	--mid report Message-ID from each message
29	--debug report debugging information
30	--progress show progress updates during check
31	--rewrite=OUT save rewritten message to OUT (default is /tmp/out)
32	--showdots print a dot for each scanned message
33	--rules=RE Only test rules matching the given regexp RE
34	--restart=N restart all of the children after processing N messages
35	--deencap=RE Extract SpamAssassin-encapsulated spam mails only if they
36	were encapsulated by servers matching the regexp RE
37	(default = extract all SpamAssassin-encapsulated mails)
38
39	log options
40	-o write all logs to stdout
41	--loghits log the text hit for patterns (useful for debugging)
42	--loguris log the URIs found
43	--hamlog=log use <log> as ham log ('ham.log' is default)
44	--spamlog=log use <log> as spam log ('spam.log' is default)
45
46	message selection options
47	-n no date sorting or spam/ham interleaving
48	--after=N only test mails received after time_t N (negative values
49	are an offset from current time, e.g. -86400 = last day)
50	or after date as parsed by Time::ParseDate (e.g. '-6 months')
51	--before=N same as --after, except received times are before time_t N
52	--all don't skip big messages
53	--head=N only check first N ham and N spam (N messages if -n used)
54	--tail=N only check last N ham and N spam (N messages if -n used)
55
56	simple target options (implies -o and no ham/spam classification)
57	--dir subsequent targets are directories
58	--file subsequent targets are files in RFC 822 format
59	--mbox subsequent targets are mbox files
60	--mbx subsequent targets are mbx files
61
62	Just left over functions we should remove at some point:
63	--bayes report score from Bayesian classifier
64
65	non-option arguments are used as target names (mail files and folders),
66	the target format is: <class>:<format>:<location>
67	<class> is "spam" or "ham"
68	<format> is "dir", "file", "mbx", or "mbox"
69	<location> is a file or directory name. globbing of ~ and * is supported
70		20
71	ENDOFUSAGE	21	mass-check - Generates SpamAssassin scores and results for large
72	}	22	amounts of mail
73		23
		24	=head1 SYNOPSIS
		25
		26	mass-check [options] class:format:location ...
		27	mass-check [options] {--dir \| --file \| --mbox} target ...
		28	mass-check [options] -f file
		29
		30	Options:
		31	-f=file read list of targets from <file>
		32	-j=jobs specify the number of processes to run simultaneously
		33	--net turn on network checks!
		34	--mid report Message-ID from each message
		35	--debug report debugging information
		36	--progress show progress updates during check
		37	--rewrite=OUT save rewritten message to OUT (default is /tmp/out)
		38	--showdots print a dot for each scanned message
		39	--rules=RE Only test rules matching the given regexp RE
		40	--restart=N restart all of the children after processing N messages
		41
		42	SpamAssassin options
		43	-c=dir set configuration/rules directory
		44	-p=file set user preferences file (default: none)
		45	-s=dir set site rules configuration directory
		46	-u=dir set user-state directory
		47	--dist assumes the script is being run from the masses/ dir of
		48	the unpacked tarball, and makes appropriate guesses for
		49	-p and -c
		50	--deencap=RE Extract SpamAssassin-encapsulated spam mails only if they
		51	were encapsulated by servers matching the regexp RE
		52	(default = extract all SpamAssassin-encapsulated mails)
		53
		54	log options
		55	-o write all logs to stdout
		56	--loghits log the text hit for patterns (useful for debugging)
		57	--loguris log the URIs found
		58	--log=file log to <file> (masses.log is default)
		59
		60	message selection options
		61	-n no date sorting or spam/ham interleaving
		62	--after=N only test mails received after time_t N (negative values
		63	are an offset from current time, e.g. -86400 = last day)
		64	or after date as parsed by Time::ParseDate (e.g. '-6 months')
		65	--before=N same as --after, except received times are before time_t N
		66	--all don't skip big messages
		67	--head=N only check first N ham and N spam (N messages if -n used)
		68	--tail=N only check last N ham and N spam (N messages if -n used)
		69
		70	simple target options (implies -o and no ham/spam classification)
		71	--dir subsequent targets are directories
		72	--file subsequent targets are files in RFC 822 format
		73	--mbox subsequent targets are mbox files
		74	--mbx subsequent targets are mbx files
		75
		76	Just left over functions we should remove at some point:
		77	--bayes report score from Bayesian classifier
		78	--hamlog=log use <log> as ham log ('ham.log' is default)
		79	--spamlog=log use <log> as spam log ('spam.log' is default)
		80
		81	=head1 DESCRIPTION
		82
		83	B<mass-check> is designed to assist with rule development and
		84	generation of SpamAssassin scores. It reads in mail from the
		85	location(s) specified on the command line (in the first form above),
		86	given in the form I<class:format:location>, where I<class> is either
		87	"spam" or "ham" (non-spam), I<format> is one of "dir" (Maildirs, MH,
88	etc), "file", "mbox" (mboxes can be gzipped) or "mbx".
89
90	B<mass-check> will analyze each message using SpamAssassin and
91	generate one line of output per message (by default to masses.log) in
92	the following format:
93
94	{s\|h} {s\|h} score filename tests-hit
95
96	The first field is the message's class as given on the command line
97	(ham or spam). The second is the message's class as determined by
98	SpamAssassin. The third is the message's score, as determined by
99	SpamAssassin. The fourth field contains the message's filename; for
100	mboxes, this contains the filename and the byte offset from the
101	beginning of the file separated by a period. The last field contains a
102	list of all the tests the message hit separated by commas.
103
104	If you want to run this on the currently installed version of
105	SpamAssassin's rules for sitewide use, make sure your user_prefs file
106	contains no rules.
107
108	=head1 BUGS
109
110	Please report bugs to http://bugzilla.spamassassin.org/
111
112	=head1 SEE ALSO
113
114	L<hit-frequencies(1)>, L<logs-to-c(1)>, L<Mail::SpamAssassin::Masses(3)>,
115	L<perceptron(1)>
116
117	=cut
118
74	###########################################################################	119	###########################################################################
75		120
76	use vars qw($opt_c $opt_p $opt_f $opt_j $opt_n $opt_o $opt_all $opt_bayes	121	use vars qw($opt_c $opt_p $opt_f $opt_j $opt_n $opt_o $opt_all
77	$opt_debug $opt_format $opt_hamlog $opt_head $opt_loghits	122	$opt_bayes $opt_before $opt_debug $opt_dist $opt_format
78	$opt_mid $opt_mh $opt_ms $opt_net $opt_nosort $opt_progress	123	$opt_hamlog $opt_head $opt_log $opt_loghits $opt_mid
79	$opt_showdots $opt_spamlog $opt_tail $opt_rules $opt_restart	124	$opt_mh $opt_ms $opt_net $opt_nosort $opt_p $opt_progress
80	$opt_loguris $opt_after $opt_before $opt_rewrite $opt_deencap);	125	$opt_s $opt_showdots $opt_spamlog $opt_tail $opt_rules
		126	$opt_restart $opt_loguris $opt_after $opt_rewrite $opt_u
		127	$opt_deencap);
81		128
82	use FindBin;	129	use FindBin;
83	use lib "$FindBin::Bin/../lib";	130	use lib "$FindBin::Bin/../lib";
84	eval "use bytes";	131	eval "use bytes";
85	use Mail::SpamAssassin::ArchiveIterator;	132	use Mail::SpamAssassin::ArchiveIterator;
86	use Mail::SpamAssassin;	133	use Mail::SpamAssassin;
87	use Getopt::Long;	134	use Getopt::Long qw(:config bundling auto_help);
		135	use Pod::Usage;
88	use POSIX qw(strftime);	136	use POSIX qw(strftime);
89	use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; };	137	use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; };
		138	use strict; # Why wasn't this on?
90	use Config;	139	use Config;
91		140
92	# default settings	141	# default settings
93	$opt_c = "$FindBin::Bin/../rules";	142
94	$opt_p = "$FindBin::Bin/spamassassin";
95	$opt_j = 1;	143	$opt_j = 1;
96	$opt_net = 0;	144	$opt_net = 0;
97	$opt_hamlog = "ham.log";	145	$opt_log = "masses.log";
98	$opt_spamlog = "spam.log";
99		146
100	GetOptions("c=s", "p=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug",	147	GetOptions("c\|cffile=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug",
101	"hamlog=s", "head=i", "loghits", "mh", "mid", "ms", "net",	148	"deencap=s", "dist!", "hamlog=s", "head=i", "log=s",
102	"progress", "rewrite:s", "showdots", "spamlog=s", "tail=i",	149	"loghits", "mh", "mid", "ms", "net", "p=s", "progress",
103	"rules=s", "restart=i", "after=s", "before=s", "loguris", "deencap=s",	150	"rewrite:s", "s=s", "showdots", "spamlog=s", "tail=i",
		151	"rules=s", "restart=i", "u=s", "after=s", "loguris",
104	"dir" => sub { $opt_format = "dir"; },	152	"dir" => sub { $opt_format = "dir"; },
105	"file" => sub { $opt_format = "file"; },	153	"file" => sub {$opt_format = "file"; },
106	"mbox" => sub { $opt_format = "mbox"; },	154	"mbox" => sub { $opt_format = "mbox"; },
107	"mbx" => sub { $opt_format = "mbx"; },	155	"mbx" => sub { $opt_format = "mbx"; },
108	'<>' => \&target) or usage();	156	'<>' => \&target);
109		157
		158	if ($opt_hamlog \|\| $opt_spamlog) { # Old style logging
		159	$opt_hamlog \|\|= "ham.log";
		160	$opt_spamlog \|\|= "spam.log";
		161	}
		162
		163	my @targets;
		164
110	if ($opt_f) {	165	if ($opt_f) {
111	open(F, $opt_f) \|\| die $!;	166	open(F, $opt_f) \|\| die $!;
112	push(@targets, map { chomp; $_ } <F>);	167	push(@targets, map { chomp; $_ } <F>);
113	close(F);	168	close(F);
114	}	169	}
115		170
116	if (scalar @targets == 0) { usage(); }	171	if (scalar @targets == 0) { pod2usage("No target defined!"); }
117		172
118	#if ($opt_ms) {	173	# Auto-detect --dist option
119	#find_missed($opt_spamlog);	174	if (!defined $opt_dist) {
120	#}	175	if (-f "$FindBin::Bin/../spamassassin.raw") {
121	#elsif ($opt_mh) {	176	warn "Automatically using --dist. Assuming you are running from the unpacked tarball. Use --no-dist to override.";
122	#find_missed($opt_hamlog);	177	$opt_dist = 1;
123	#}	178	}
		179	}
124		180
125	$spamtest = new Mail::SpamAssassin ({	181	my $local_rules_dir;
126	'debug' => $opt_debug,
127	'rules_filename' => $opt_c,
128	'userprefs_filename' => "$opt_p/user_prefs",
129	'site_rules_filename' => "$opt_p/local.cf",
130	'userstate_dir' => "$opt_p",
131	'save_pattern_hits' => $opt_loghits,
132	'dont_copy_prefs' => 1,
133	'local_tests_only' => $opt_net ? 0 : 1,
134	'only_these_rules' => $opt_rules,
135	'ignore_safety_expire_timeout' => 1,
136	PREFIX => '',
137	DEF_RULES_DIR => $opt_c,
138	LOCAL_RULES_DIR => '',
139	});
140		182
		183	if ($opt_dist) { # Set defaults
		184	$opt_c \|\|= "$FindBin::Bin/../rules";
		185	$opt_p \|\|= "$FindBin::Bin/mass-check.cf";
		186	$opt_u \|\|= "$FindBin::Bin/spamassassin";
		187	$opt_s \|\|= "$FindBin::Bin/spamassassin";
		188	$local_rules_dir = '';
		189	}
		190	else {
		191	if(!$opt_u) {
		192	# Assuming this is OK, since mass-check isn't supported on Windows, is it?
		193	# Also, should there be some check to make sure that previous mass-check stuff isn't in there?
		194	# AFAICT, there isn't otherwise....
		195	if ( -d "${ENV{HOME}}/.spamassassin" ) {
		196	$opt_u = "${ENV{HOME}}/.spamassassin/mass-check";
		197	warn "$opt_u already exists -- may contain files that will effect the results" if (-d $opt_u);
		198	mkdir $opt_u, 0700 if (! -d $opt_u);
		199	}
		200	}
		201
		202	# Leave the rest to SA, we'll get it afterwards
		203
		204	}
		205
		206
		207	$opt_s =~ s/~/$ENV{HOME}/ if $opt_s;
		208	$opt_c =~ s/~/$ENV{HOME}/ if $opt_c;
		209	$opt_p =~ s/~/$ENV{HOME}/ if $opt_p;
		210	$opt_u =~ s/~/$ENV{HOME}/ if $opt_u;
		211
		212
		213	my $spamtest = new Mail::SpamAssassin ({
		214	'debug' => $opt_debug,
		215	'rules_filename' => $opt_c,
		216	'userprefs_filename' => $opt_p,
		217	'site_rules_filename' => $opt_s,
		218	'userstate_dir' => $opt_u,
		219	'save_pattern_hits' => $opt_loghits,
		220	'dont_copy_prefs' => 1,
		221	'local_tests_only' => $opt_net ? 0 : 1,
		222	'only_these_rules' => $opt_rules,
		223	'ignore_safety_expire_timeout' => 1,
		224	DEF_RULES_DIR => $opt_c,
		225	LOCAL_RULES_DIR => $local_rules_dir,
		226	});
		227
141	$spamtest->compile_now(1);	228	$spamtest->compile_now(1);
142	$spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf");	229	if ($opt_dist) {
		230	$spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf");
		231	}
143		232
144	my $who = `id -un 2>/dev/null`; chomp $who;	233	my $who = `id -un 2>/dev/null`; chomp $who;
145	my $where = `uname -n 2>/dev/null`; chomp $where;	234	my $where = `uname -n 2>/dev/null`; chomp $where;
146	my $when = `date -u`; chomp $when;	235	my $when = `date -u`; chomp $when;
147	my $revision = "unknown";	236	my $revision;
148	if (open(TESTING, "$opt_c/70_testing.cf")) {	237
149	chomp($revision = <TESTING>);	238	if ($opt_dist) {
150	$revision =~ s/.\$Rev:\s(\S+).*/$1/;	239	my $rev = "unknown";
151	close(TESTING);	240	if (open(TESTING, "$opt_c/70_testing.cf")) {
		241	chomp($rev = <TESTING>);
		242	$rev =~ s/.\$Rev:\s(\S+).*/$1/;
		243	close(TESTING);
		244	}
		245	$revision = "SVN revision: $rev";
152	}	246	}
		247	else {
		248	$revision = "Local";
		249	}
		250
153	my $log_header = "# mass-check results from $who\@$where, on $when\n" .	251	my $log_header = "# mass-check results from $who\@$where, on $when\n" .
154	"# M:SA version ".$spamtest->Version()."\n" .	252	"# M:SA version ".$spamtest->Version()."\n" .
155	"# SVN revision: $revision\n" .	253	"# $revision\n" .
156	"# Perl version: $] on $Config{archname}\n";	254	"# Perl version: $] on $Config{archname}\n";
		255
		256	if (!$opt_dist) {
		257	my @paths = ( $spamtest->{rules_filename}, $spamtest->{site_rules_filename}, $spamtest->{userprefs_filename} );
		258	$log_header .= "# Using configuration:\n";
		259	foreach my $file (@paths) {
		260	$log_header .= "# $file\n";
		261	}
		262	}
		263
157	my $host = $ENV{'HOSTNAME'} \|\| $ENV{'HOST'} \|\| `hostname` \|\| 'localhost';	264	my $host = $ENV{'HOSTNAME'} \|\| $ENV{'HOST'} \|\| `hostname` \|\| 'localhost';
158	chomp $host;	265	chomp $host;
159		266
Lines 222-228 Link Here
222	autoflush STDOUT 1;	329	autoflush STDOUT 1;
223	print STDOUT $log_header;	330	print STDOUT $log_header;
224	}	331	}
225	else {	332	elsif ($opt_hamlog \|\| $opt_spamlog) {
226	open(HAM, "> $opt_hamlog");	333	open(HAM, "> $opt_hamlog");
227	open(SPAM, "> $opt_spamlog");	334	open(SPAM, "> $opt_spamlog");
228	autoflush HAM 1;	335	autoflush HAM 1;
Lines 230-235 Link Here
230	print HAM $log_header;	337	print HAM $log_header;
231	print SPAM $log_header;	338	print SPAM $log_header;
232	}	339	}
		340	else {
		341	open(OUT, "> $opt_log");
		342	autoflush OUT 1;
		343	print OUT $log_header;
		344	}
233	$init_results = 1;	345	$init_results = 1;
234	}	346	}
235		347
Lines 239-263 Link Here
239	# don't open results files until we get here to avoid overwriting files	351	# don't open results files until we get here to avoid overwriting files
240	&init_results if !$init_results;	352	&init_results if !$init_results;
241		353
242	if ($class eq "s") {	354	if ($opt_o) {
243	if ($opt_o) { print STDOUT $result; } else { print SPAM $result; }	355	print STDOUT $result;
244	$spam_count++;
245	}	356	}
246	elsif ($class eq "h") {	357	elsif ($opt_spamlog \|\| $opt_hamlog) {
247	if ($opt_o) { print STDOUT $result; } else { print HAM $result; }	358	if ($class eq "s") {
248	$ham_count++;	359	print SPAM $result;
		360	} else {
		361	print HAM $result;
		362	}
249	}	363	}
		364	else {
		365	print OUT $result;
		366	}
250		367
251	$total_count++;	368	$total_count++;
252	#warn ">> result: $total_count $class $time\n";	369	#warn ">> result: $total_count $class $time\n";
253		370
254	if ($opt_progress) {	371	if ($opt_progress) {
		372	if ($class eq "s") {
		373	$spam_count++;
		374	}
		375	else {
		376	$ham_count++;
		377	}
255	progress($time);	378	progress($time);
256	}	379	}
257	}	380	}
258		381
259	sub wanted {	382	sub wanted {
260	my (undef, $id, $time, $dataref) = @_;	383	my ($class, $id, $time, $dataref) = @_;
261	my $out;	384	my $out;
262		385
263	my $ma = $spamtest->parse($dataref, 1);	386	my $ma = $spamtest->parse($dataref, 1);
Lines 308-325 Link Here
308	push(@extra, "mid=$mid");	431	push(@extra, "mid=$mid");
309	}	432	}
310		433
311	my $yorn;	434	my $result;
312	my $score;	435	my $score;
313	my $tests;	436	my $tests;
314	my $extra;	437	my $extra;
315		438
316	if ($opt_loguris) {	439	if ($opt_loguris) {
317	$yorn = '.';	440	$result = '.';
318	$score = 0;	441	$score = 0;
319	$tests = join(" ", sort @uris);	442	$tests = join(" ", sort @uris);
320	$extra = '';	443	$extra = '';
321	} else {	444	} else {
322	$yorn = $status->is_spam() ? 'Y' : '.';	445	if ($status->is_spam()) {
		446	$result = "s";
		447	} else {
		448	$result = "h";
		449	}
323	$score = $status->get_score();	450	$score = $status->get_score();
324	$tests = join(",", sort(grep(length,$status->get_names_of_tests_hit(),$status->get_names_of_subtests_hit())));	451	$tests = join(",", sort(grep(length,$status->get_names_of_tests_hit(),$status->get_names_of_subtests_hit())));
325	$extra = join(",", @extra);	452	$extra = join(",", @extra);
Lines 333-339 Link Here
333		460
334	$id =~ s/\s/_/g;	461	$id =~ s/\s/_/g;
335		462
336	$out .= sprintf("%s %2d %s %s %s\n", $yorn, $score, $id, $tests, $extra);	463	$out .= sprintf("%s %s %05.2f %s %s %s\n", $class, $result, $score, $id, $tests, $extra);
337		464
338	if ($tests =~ /MICROSOFT_EXECUTABLE\|MIME_SUSPECT_NAME/) {	465	if ($tests =~ /MICROSOFT_EXECUTABLE\|MIME_SUSPECT_NAME/) {
339	$out .= logkilled($ma, $id, "possible virus");	466	$out .= logkilled($ma, $id, "possible virus");

Lines 33-40 Link Here

(-)masses/README (-13 / +10 lines)
33		33
34	See the CORPUS_POLICY file for more details.	34	See the CORPUS_POLICY file for more details.
35		35
36
37
38	HOW TO SUBMIT RESULTS BACK TO US	36	HOW TO SUBMIT RESULTS BACK TO US
39	--------------------------------	37	--------------------------------
40		38
Lines 52-62 Link Here
52	This script is used to perform "mass checks" of a set of mailboxes, Cyrus	50	This script is used to perform "mass checks" of a set of mailboxes, Cyrus
53	folders, and/or MH mail spools. It generates summary lines like this:	51	folders, and/or MH mail spools. It generates summary lines like this:
54		52
55	Y 7 /home/jm/Mail/Sapm/1382 SUBJ_ALL_CAPS,SUPERLONG_LINE,SUBJ_FULL_OF_8BITS	53	s s 07.22 /home/jm/Mail/Sapm/1382 SUBJ_ALL_CAPS,SUPERLONG_LINE,SUBJ_FULL_OF_8BITS
56		54
57	or for mailboxes,	55	or for mailboxes,
58		56
59	. 1 /path/to/mbox:<5.1.0.14.2.20011004073932.05f4fd28@localhost> TRACKER_ID,BALANCE_FOR_LONG	57	h h 01.32 /path/to/mbox:<5.1.0.14.2.20011004073932.05f4fd28@localhost> TRACKER_ID,BALANCE_FOR_LONG
60		58
61	listing the path to the message or its message ID, its score, and the tests	59	listing the path to the message or its message ID, its score, and the tests
62	that triggered on that mail.	60	that triggered on that mail.
Lines 65-87 Link Here
65	get good hits with few false positives, etc., and re-score the tests to	63	get good hits with few false positives, etc., and re-score the tests to
66	optimise the ratio.	64	optimise the ratio.
67		65
68	This script relies on the spamassassin distribution directory living in "..".	66	If given the --dist option, this script relies on the spamassassin
		67	distribution directory living in "..". If this script is not in the
		68	distribution directory, it will generate logs based on the site-wide
		69	rules, as well as personal rules.
69		70
70
71	logs-to-c :	71	logs-to-c :
72		72
73	Takes the "spam.log" and "nonspam.log" files and converts them into C	73	Takes the "masses.log" file and converts it into C source files
74	source files and simplified data files for use by the C score optimization	74	and simplified data files for use by the C score optimization
75	algorithm. (Called by "make" when you build the perceptron, so generally	75	algorithm. (Called by "make" when you build the perceptron, so
76	you won't need to run it yourself.)	76	generally you won't need to run it yourself.)
77		77
78
79	hit-frequencies :	78	hit-frequencies :
80		79
81	Analyses the log files and computes how often each test hits, overall,	80	Analyses the log files and computes how often each test hits, overall,
82	for spam mails and for non-spam.	81	for spam mails and for non-spam.
83		82
84
85	mk-baseline-results :	83	mk-baseline-results :
86		84
87	Compute results for the baseline scores (read from ../rules/*). If you	85	Compute results for the baseline scores (read from ../rules/*). If you
Lines 91-97 Link Here
91	It will output statistics on the current ruleset to ../rules/STATISTICS.txt,	89	It will output statistics on the current ruleset to ../rules/STATISTICS.txt,
92	suitable for a release build of SpamAssassin.	90	suitable for a release build of SpamAssassin.
93		91
94
95	perceptron.c :	92	perceptron.c :
96		93
97	Perceptron learner by Henry Stern. See "README.perceptron" for details.	94	Perceptron learner by Henry Stern. See "README.perceptron" for details.

Lines 1-3 Link Here

(-)masses/fp-fn-statistics (-2 / +190 lines)
1	#!/bin/sh	1	#!/usr/bin/perl -w
		2	#
		3	# <@LICENSE>
		4	# Copyright 2004 Apache Software Foundation
		5	#
		6	# Licensed under the Apache License, Version 2.0 (the "License");
		7	# you may not use this file except in compliance with the License.
		8	# You may obtain a copy of the License at
		9	#
		10	# http://www.apache.org/licenses/LICENSE-2.0
		11	#
		12	# Unless required by applicable law or agreed to in writing, software
		13	# distributed under the License is distributed on an "AS IS" BASIS,
		14	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		15	# See the License for the specific language governing permissions and
		16	# limitations under the License.
		17	# </@LICENSE>
2		18
3	exec ./logs-to-c --count $*	19	=head1 NAME
		20
		21	fp-fn-statistics - Display statistics about the quality of scores
		22
		23	=head1 SYNOPSIS
		24
		25	fp-fn-statistics [options]
		26
		27	Options:
		28	-c,--cffile=path Use path as the rules directory
		29	-s,--scoreset=n Use scoreset n
		30	-l,--logfile=file Read in file instead of masses.log
		31	-t,--threshold=n Use a spam/ham threshold of n (default: 5)
		32	--lambda=n Use a lambda value of n
		33
		34	=head1 DESCRIPTION
		35
		36	B<fp-fn-statistics> first calculates the score each message from a
		37	masses.log would have under a new set of scores. It then aggregates
		38	the number of messages correctly and incorrectly found as spam and
		39	ham, and their average scores.
		40
		41	In addition, B<fp-fn-statistics> determines the "Total Cost Ratio" as
		42	a result of the false positives and negatives mentioned above. This
		43	calculation takes into account the value of lambda, which represents the cost
		44	of recovering a false positive, where 1 indicates a message is tagged
		45	only, 9 means the message is mailed back to sender asking for a token
		46	(TMDA style) and 999 means a message is deleted. The default, 5,
		47	represents the message being moved to an infrequently read folder.
		48
		49	=cut
		50
		51	use FindBin;
		52	use lib "$FindBin::Bin/../lib";
		53	use Mail::SpamAssassin::Masses;
		54	use Getopt::Long qw(:config bundling auto_help);
		55	use Pod::Usage;
		56	use strict;
		57	use warnings;
		58
		59	use vars qw{$opt_c $opt_l $opt_s $opt_t $opt_lambda};
		60
		61	GetOptions("c\|cffile=s@" => \$opt_c,
		62	"l\|logfile=s" => \$opt_l,
		63	"s\|scoreset=i" => \$opt_s,
		64	"t\|threshold=f" => \$opt_t,
		65	"lambda" => \$opt_lambda);
		66
		67	$opt_l \|\|= "masses.log";
		68
		69	if (!$opt_c \|\| !scalar(@$opt_c)) {
		70	# Try to read this in from the log, if possible
		71	open IN, $opt_l or die "Can't open $opt_l: $!";
		72	my $files = 0; # are we in the files section?
		73	while(<IN>) {
		74	if (!$files) {
		75	if (/^\# SVN revision:/) {
		76	$opt_c = [ "$FindBin::Bin/../rules" ];
		77	last;
		78	} elsif (/^\# Using configuration:$/) {
		79	$files = 1;
		80	}
		81	} elsif (/^\#\s+(.)\s$/) {
		82	push (@$opt_c, $1);
		83	} else {
84	# All done!
85	last;
86	}
87	}
88
89	if (!defined $opt_c) {
90	$opt_c = [ "$FindBin::Bin/../rules" ];
91	}
92
93	foreach my $file (@$opt_c) {
94	die "Can't read $file" unless -r $file;
95	}
96	}
97
98	$opt_t = (defined($opt_t) ? $opt_t : 5);
99	$opt_s \|\|= 0;
100	$opt_lambda \|\|= 5;
101
102	my $nybias = 10;
103
104
105	my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
106	scoreset => $opt_s, # ,,
107	logfile => $opt_l});
108
109	$masses->readlogs();
110
111	my $logs = $masses->get_logs();
112
113	my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore) = (0,0,0,0,0,0,0,0);
114
115	my $num_spam = $masses->get_num_spam();
116	my $num_ham = $masses->get_num_ham();
117	my $num_logs = $num_spam + $num_ham;
118
119	my $count = 0;
120
121	my $score;
122
123	foreach my $log (@$logs) {
124
125	$score = 0;
126	foreach my $test (@{$log->{tests_hit}}) {
127
128	next if ($test->{issubrule});
129	next if (!$test->{score});
130
131	$score += $test->{score};
132
133	}
134
135	if ($score >= $opt_t) {
136	if ($log->{isspam}) {
137	$ga_yy++;
138	$yyscore += $score;
139	}
140	else {
141	$ga_ny++;
142	$nyscore += $score;
143	}
144	} else {
145	if ($log->{isspam}) {
146	$ga_yn++;
147	$ynscore += $score;
148	}
149	else {
150	$ga_nn++;
151	$nnscore += $score;
152	}
153	}
154	}
155
156	$nybias = $nybias * ($num_spam / $num_ham);
157
158	my $fprate = ($ga_ny / $num_logs) * 100.0;
159	my $fnrate = ($ga_yn / $num_logs) * 100.0;
160
161	printf ("\n# SUMMARY for threshold %3.1f:\n", $opt_t);
162	printf "# Correctly non-spam: %6d %4.2f%% (%4.2f%% of non-spam corpus)\n", $ga_nn,
163	($ga_nn / $num_logs) * 100.0, ($ga_nn / $num_ham) * 100.0;
164	printf "# Correctly spam: %6d %4.2f%% (%4.2f%% of spam corpus)\n" , $ga_yy,
165	($ga_yy / $num_logs) * 100.0, ($ga_yy / $num_spam) * 100.0;
166	printf "# False positives: %6d %4.2f%% (%4.2f%% of nonspam, %6.0f weighted)\n", $ga_ny,
167	$fprate, ($ga_ny / $num_ham) * 100.0, $nyscore*$nybias;
168	printf "# False negatives: %6d %4.2f%% (%4.2f%% of spam, %6.0f weighted)\n", $ga_yn,
169	$fnrate, ($ga_yn / $num_spam) * 100.0, $ynscore;
170
171	# convert to the TCR metrics used in the published lit
172	my $nspamspam = $ga_yy;
173	my $nspamlegit = $ga_yn;
174	my $nlegitspam = $ga_ny;
175	my $nlegitlegit = $ga_yn;
176	my $nlegit = $num_ham;
177	my $nspam = $num_spam;
178
179	my $werr = ($opt_lambda * $nlegitspam + $nspamlegit)
180	/ ($opt_lambda * $nlegit + $nspam);
181
182	my $werr_base = $nspam
183	/ ($opt_lambda * $nlegit + $nspam);
184
185	$werr \|\|= 0.000001; # avoid / by 0
186	my $tcr = $werr_base / $werr;
187
188	my $sr = ($nspamspam / $nspam) * 100.0;
189	my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;
190	printf "# TCR: %3.6f SpamRecall: %3.3f%% SpamPrec: %3.3f%% FP: %3.2f%% FN: %3.2f%%\n", $tcr, $sr, $sp, $fprate, $fnrate;
191

Lines 19-48 Link Here

(-)masses/extract-message-from-mbox (-17 / +48 lines)
19	use bytes;	19	use bytes;
20		20
21	use vars qw {	21	use vars qw {
22	$opt_f $opt_h $opt_m $opt_H	22	$opt_h $opt_m
23	};	23	};
24		24
25	use Getopt::Std;
26	getopts("f:hmH");
27		25
28	sub usage {	26	use Getopt::Long qw(:config bundling auto_help);
29	die "extract-message-from-mbox [-f=file] [-m] [-H] offset	27	use Pod::Usage;
30		28
31	Extracts the message starting at offset from file (or stdin). Very	29	GetOptions("m\|mass-check" => \$opt_m, "h\|H\|headers" => \$opt_h);
32	useful in combination with mass-check logs and mboxes. If the -m
33	option is used, the input should be in \"mass-check\" format (as
34	output by mass-check). Use the -H option to just output headers.
35	";
36	}
37		30
38	usage() if($opt_h \|\| (!defined($ARGV[0]) && !$opt_m));	31	=head1 NAME
39	my $offset = $ARGV[0];
40		32
		33	extract-message-from-mbox - Extract a message from an mbox
		34
		35	=head1 SYNOPSIS
		36
		37	extract-message-from-mbox [--headers] <mbox>.<offset>
		38	extract-message-from-mbox --mass-check
		39
		40	Options:
		41	-h, --headers Display only message headers
		42	-m, --mass-check Read mass-check output from stdin
		43
		44	=head1 DESCRIPTION
		45
		46	B<extract-message-from-mbox> extracts the message from I<mbox>
		47	starting at the byte offset I<offset>. Very useful in combination with
		48	mass-check logs and mboxes. If the -m or --mass-check option is used,
		49	the input should be in "mass-check" format (as output by
		50	mass-check). Use the -H option to just output headers.
		51
		52	=head1 EXAMPLES
		53
		54	To show messages that hit the rule BAYES_99
		55
		56	grep BAYES_99 masses.log \| extract-message-from-mbox -m
		57
		58	To show the message indicated by "/path/to/my/mbox.1234"
		59
		60	extract-message-from-mbox /path/to/my/mbox.1234
		61
		62	=cut
		63
		64
		65
41	if($opt_m) {	66	if($opt_m) {
42	masscheck();	67	masscheck();
43	} else {	68	} else {
44	$opt_f \|\|= '&STDIN';	69	foreach my $message (@ARGV) {
45	extract($opt_f, $offset);	70	if ($message =~ /^(.*?)(?:\.(\d+))?$/) {
		71	extract($1, ($2 \|\| 0));
		72	}
		73	else {
		74	pod2usage("Argument must be of the form <mbox>.<offset>");
		75	}
		76	}
46	}	77	}
47		78
48	sub extract {	79	sub extract {
Lines 61-74 Link Here
61	$found++ if(/^From /);	92	$found++ if(/^From /);
62	last if($found == 3);	93	last if($found == 3);
63	print;	94	print;
64	last if ($opt_H && /^$/) # empty line? end of headers	95	last if ($opt_h && /^$/) # empty line? end of headers
65	}	96	}
66	}	97	}
67	}	98	}
68		99
69	sub masscheck {	100	sub masscheck {
70	while (<STDIN>) {	101	while (<STDIN>) {
71	my $mail = (split(/\s+/, $_))[2];	102	my $mail = (split(/\s+/, $_))[3];
72	$mail =~ tr/_/ /;	103	$mail =~ tr/_/ /;
73	if ($mail =~ /^(.*)\.(\d+)$/) {	104	if ($mail =~ /^(.*)\.(\d+)$/) {
74	extract($1, $2);	105	extract($1, $2);

Lines 16-272 Link Here

(-)masses/logs-to-c (-346 / +219 lines)
16	# limitations under the License.	16	# limitations under the License.
17	# </@LICENSE>	17	# </@LICENSE>
18		18
19	use Getopt::Long;	19	=head1 NAME
20	use vars qw($opt_cffile $opt_count $opt_lambda $opt_threshold
21	$opt_spam $opt_nonspam);
22		20
23	GetOptions("cffile=s", "count", "lambda=f", "threshold=f", "spam=s", "nonspam=s", "scoreset=i");	21	logs-to-c - Convert a mass-check log into perceptron format
24	my $argcffile = $opt_cffile;
25		22
26	my $justcount = 0;	23	=head1 SYNOPSIS
27	if ($opt_count) { $justcount = 1; }
28		24
29	my $threshold = 5;	25	logs-to-c [options]
30	if (defined $opt_threshold) { $threshold = $opt_threshold; }
31		26
32	$opt_spam \|\|= 'spam.log';	27	Options:
33	$opt_nonspam \|\|= 'ham.log';	28	-c,--cffile=path Use path as the rules directory
34	$opt_scoreset = 0 if ( !defined $opt_scoreset );	29	-s,--scoreset=n Use scoreset n
		30	-l,--logfile=file Read in file instead of masses.log
		31	-o,--output=dir Put output in the specified dir (default tmp/)
35		32
36	my $nybias = 10;	33	=head1 DESCRIPTION
37		34
38	# lambda value for TCR equation, indicating the "cost" of recovering	35	B<logs-to-c> will read the mass-check log F<masses.log> or as
39	# from an FP. The values are: 1 = tagged only, 9 = mailed back to	36	specified by the B<--logfile> option, and convert it into the format
40	# sender asking for token (TMDA style), 999 = deleted outright.	37	needed by the perceptron. This is a format that is simple for the
41	# We (SpamAssassin) use a default of 5, representing "moved to	38	perceptron to parse, but is not very readable to humans.
42	# infrequently-read folder".
43		39
44	my $lambda = 5;	40	By default, output will be put in the directory ./tmp/ unless another
45	if ($opt_lambda) { $lambda = $opt_lambda; }	41	directory is specified by the B<--output> option. (Note: at the
		42	current time, this must be /tmp/ in order for the perceptron to
		43	compile properly.)
46		44
47	my %is_spam = ();	45	=head1 BUGS
48	my %tests_hit = ();
49	my %mutable_tests = ();
50		46
51	use vars qw(%rules %allrules);	47	Please report bugs to http://bugzilla.spamassassin.org/
52		48
53	readscores();	49	=head1 SEE ALSO
54		50
55	print "Reading per-message hit stat logs and scores...\n";	51	L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>
56	my ($num_tests, $num_spam, $num_nonspam);
57	my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore);
58		52
59	readlogs();	53	=cut
60	read_ranges();
61		54
62	if ($justcount) {	55	use FindBin;
63	$nybias = $nybias*($num_spam / $num_nonspam);	56	use lib "$FindBin::Bin/../lib";
64	evaluate();	57	use Mail::SpamAssassin::Masses;
65	} else {	58	use Getopt::Long qw(:config bundling auto_help);
66	print "Writing logs and current scores as C code...\n";	59	use Pod::Usage;
67	writescores_c();	60	use strict;
68	}	61	use warnings;
69	exit 0;
70		62
		63	use vars qw{$opt_c $opt_l $opt_s $opt_o};
71		64
72	sub readlogs {	65	GetOptions("c\|cffile=s@" => \$opt_c,
73	my $count = 0;	66	"s\|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
74	$num_spam = $num_nonspam = 0;	67	"l\|logfile=s" => \$opt_l,
		68	"o\|output=s" => \$opt_o);
75		69
76	if ($justcount) {
77	$ga_yy = $ga_ny = $ga_yn = $ga_nn = 0;
78	$yyscore = $ynscore = $nyscore = $nnscore = 0.0;
79	}
80		70
81	foreach my $file ($opt_spam, $opt_nonspam) {	71	$opt_o \|\|= "./tmp/";
82	open (IN, "<$file");	72	if (!-d $opt_o) {
		73	mkdir $opt_o, 0777 or die "Can't mkdir $opt_o";
		74	}
83		75
84	while (<IN>) {	76	$opt_l \|\|= "masses.log";
85	next if /^\#/;
86	next if /^$/;
87	if($_ !~ /^.\s+([-\d]+)\s+\S+\s*/) { warn "bad line: $_"; next; }
88	my $hits = $1;
89	#my $foo = $_;
90	$_ = $'; s/(?:bayes\|time)=\S+//; s/,,+/,/g; s/^\s+//; s/\s+$//;
91		77
92	my $score = 0;	78	if (!$opt_c \|\| !scalar(@$opt_c)) {
93	my @tests = ();	79	# Try to read this in from the log, if possible
94	foreach my $tst (split (/,/, $_)) {	80	open IN, $opt_l or die "Can't open $opt_l: $!";
95	next if ($tst eq '');	81	my $files = 0; # are we in the files section?
96	if (!defined $scores{$tst}) {	82	while(<IN>) {
97	#warn "unknown test in $file, ignored: $tst\n";	83	if (!$files) {
98	next;	84	if (/^\# SVN revision:/) {
		85	$opt_c = [ "$FindBin::Bin/../rules" ];
		86	last;
		87	} elsif (/^\# Using configuration:$/) {
		88	$files = 1;
		89	}
		90	} elsif (/^\#\s+(.)\s$/) {
		91	push (@$opt_c, $1);
		92	} else {
		93	# All done!
		94	last;
99	}	95	}
		96	}
100		97
101	# Make sure to skip any subrules!	98	if (!defined $opt_c) {
102	next if ( $allrules{$tst}->{issubrule} );	99	$opt_c = [ "$FindBin::Bin/../rules" ];
		100	}
103		101
104	if ($justcount) {	102	foreach my $file (@$opt_c) {
105	$score += $scores{$tst};	103	die "Can't read $file" unless -r $file;
106	} else {
107	push (@tests, $tst);
108	}
109	}
110
111	if (!$justcount) {
112	$tests_hit{$count} = \@tests;
113	}
114
115	if ($file eq $opt_spam) {
116	$num_spam++;
117	if ($justcount) {
118	if ($score >= $threshold) {
119	$ga_yy++; $yyscore += $score;
120	} else {
121	$ga_yn++; $ynscore += $score;
122	}
123	} else {
124	$is_spam{$count} = 1;
125	}
126	} else {
127	$num_nonspam++;
128	if ($justcount) {
129	if ($score >= $threshold) {
130	#print "$score -- $foo";
131	$ga_ny++; $nyscore += $score;
132	} else {
133	$ga_nn++; $nnscore += $score;
134	}
135	} else {
136	$is_spam{$count} = 0;
137	}
138	}
139	$count++;
140	}	104	}
141	close IN;
142	}
143	$num_tests = $count;
144	}	105	}
145		106
		107	# ignore rules that are subrules -- we don't generate scores for them...
146		108
147	sub readscores {	109	# Note: this will cause a difference over the old logs-to-c since rank
148	if (!defined $argcffile) { $argcffile = "../rules"; }	110	# is dependent on the frequencies of all rules, not just non-subrules
149	print "Reading scores from \"$argcffile\"...\n";
150	system ("./parse-rules-for-masses -d \"$argcffile\" -s $opt_scoreset") and die;
151	require "./tmp/rules.pl";
152	%allrules = %rules; # ensure it stays global
153	}
154		111
		112	my $greprules = sub { return 0 if $_[1]->{issubrule}; return 1; };
		113
		114	$opt_s \|\|= 0; # \|
		115
		116	my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
		117	scoreset => $opt_s, # ,,
		118	logfile => $opt_l,
		119	greprules => $greprules });
		120
		121	$masses->readlogs();
		122	$masses->do_score_ranges();
		123
		124	my $rules = $masses->get_rules_array();
		125	my $logs = $masses->get_logs();
		126
		127	my @index_to_rule;
		128	my $num_spam = $masses->get_num_spam();
		129	my $num_ham = $masses->get_num_ham();
		130
		131	# This is misleading -- num_tests is really num_msgs
		132	my $num_tests = $num_spam + $num_ham;
		133
		134
		135	# Write logs and scores as C code
		136	writescores_c();
		137	writetests_c();
		138
		139
155	sub writescores_c {	140	sub writescores_c {
156	my $output = '';	141
157	my $size = 0;
158	my $mutable = 0;	142	my $mutable = 0;
159	my $i;	143	my $output = '';
		144	my $count = 0;
		145	my $score = 0;
160		146
161	# jm: now, score-ranges-from-freqs has tflags to work from, so	147	foreach my $rule (sort {($b->{ismutable} <=> $a->{ismutable}) \|\|
162	# it will always list all mutable tests.	148	($a->{name} cmp $b->{name}) } @$rules) {
163		149
164	@index_to_rule = sort {($ignored_rule{$a} <=> $ignored_rule{$b}) \|\|	150	$score = $rule->{score};
165	($mutable_tests{$b} <=> $mutable_tests{$a}) \|\|
166	($a cmp $b)} (keys %scores);
167	my $max_hits_per_msg = 0;
168	for ($file = 0; $file < $num_tests; $file++) {
169	my(@hits) =
170	grep {(! $ignored_rule{$_}) && $mutable_tests{$_}} (@{$tests_hit{$file}});
171	if ((scalar(@hits)+1) > $max_hits_per_msg) {
172	$max_hits_per_msg = scalar(@hits)+1;
173	}
174	}
175		151
176	for ($i = 0; $i <= $#index_to_rule; $i++) {	152	# ignored rules (i.e. no scores)
177	my $name = $index_to_rule[$i];	153	next unless $score;
178	$rule_to_index{$name} = $i;
179		154
180	if ($ignored_rule{$name}) { next; }	155	# also ignore rules with score range 0
		156	next if (!$rule->{range_lo} && !$rule->{range_hi});
181		157
182	if ($mutable_tests{$name} == 0) {	158	# Set an index
183	$range_lo{$name} = $range_hi{$name} = $scores{$name};	159	$rule->{index} = $count;
184	} else {	160	$index_to_rule[$count] = $rule; # add the reference to the array
		161
		162	if ($rule->{ismutable}) {
185	$mutable++;	163	$mutable++;
186	if ($range_lo{$name} > $range_hi{$name}) {	164	if ($score > $rule->{range_hi}) {
187	($range_lo{$name},$range_hi{$name}) =	165	$score = $rule->{range_hi} - 0.001;
188	($range_hi{$name},$range_lo{$name});	166	} elsif ($score < $rule->{range_lo}) {
		167	$score = $rule->{range_lo} + 0.001;
189	}	168	}
190	#$range_lo{$name} \|\|= 0.1;
191	#$range_hi{$name} \|\|= 1.5;
192	}	169	}
		170	# These should all be set properly if not mutable
		171	# score = range_lo = range_hi
		172	else {
		173	warn "hi != lo for " . $rule->{name} . "!" if $rule->{range_lo} != $rule->{range_hi};
		174	$score = $rule->{range_hi} = $rule->{range_lo};
		175	}
193		176
194	$output .= ".".$i."\n".	177	$output .= "." . $count . "\n" .
195	"n".$name."\n".	178	"n" . $rule->{name} . "\n" .
196	"b".$scores{$name}."\n".	179	"b" . $score . "\n" .
197	"m".$mutable_tests{$name}."\n".	180	"m" . $rule->{ismutable} . "\n" .
198	"l".$range_lo{$name}."\n".	181	"l" . $rule->{range_lo} . "\n" .
199	"h".$range_hi{$name}."\n";	182	"h" . $rule->{range_hi} . "\n";
200	$size++;	183
		184	$count++;
		185
201	}	186	}
202		187
		188	# Output this
203		189
204	open (DAT, ">tmp/scores.data");	190	open (DAT, ">$opt_o/scores.data");
205	print DAT "N$size\n", "M$mutable\n", # informational only	191	print DAT "N$count\n", "M$mutable\n"; # informational
206	$output;	192	print DAT $output;
207	close DAT;	193	close DAT;
208		194
209	open (OUT, ">tmp/scores.h");	195	open (OUT, ">$opt_o/scores.h");
210	print OUT "	196	print OUT <<EOF;
211	#include <stdio.h>	197	#include <stdio.h>
212	#include <string.h>	198	#include <string.h>
213	#include <stdlib.h>	199	#include <stdlib.h>
214		200
215	int num_scores = $size;	201	int num_scores = $count;
216	int num_mutable = $mutable;	202	int num_mutable = $mutable;
217	unsigned char is_mutable[$size];	203	unsigned char is_mutable[$count];
218	double range_lo[$size];	204	double range_lo[$count];
219	double range_hi[$size];	205	double range_hi[$count];
220	double bestscores[$size];	206	double bestscores[$count];
221	char *score_names[$size];	207	char *score_names[$count];
222	double tmp_scores[$size][2];	208	double tmp_scores[$count][2];
223	unsigned char ny_hit[$mutable];	209	unsigned char ny_hit[$mutable];
224	unsigned char yn_hit[$mutable];	210	unsigned char yn_hit[$mutable];
225		211
226	double lookup[$mutable];	212	double lookup[$mutable];
227		213
228	/* readscores() is defined in tests.h */	214	/* readscores() is defined in tests.h */
		215	EOF
229		216
230	";
231	close OUT;	217	close OUT;
232		218
233	writetests_c($max_hits_per_msg); # make sure $rule_to_index is around
234	}	219	}
235		220
		221
236	sub writetests_c {	222	sub writetests_c {
237	my $max_hits_per_msg = $_[0];
238		223
239	my(%uniq_files) = ();	224	my $max_hits_per_msg = 0;
240	my(%count_keys) = ();	225	my @goodtests;
241	my(%file_key) = ();	226	my %uniq_logs;
		227	my $uniq_key;
242		228
243	my $file;	229	my $i = 0;
244		230
245	for ($file = 0; $file < $num_tests; $file++)	231	# This will "compress" the logs so that one log entry can have a
246	{	232	# "count" of n indicating it represents n similar messages
247	my $uniq_key = $is_spam{$file} . " ";
248		233
249	my(@good_tests) =	234	foreach my $log (@$logs) {
250	grep {length($_) && (! $ignored_rule{$_}) &&
251	(defined($rule_to_index{$_}))} (@{ $tests_hit{$file} });
252		235
253	@good_tests = sort {$a <=> $b} (map {$rule_to_index{$_}} (@good_tests));	236	(@goodtests) = grep {exists($_->{index})} (@{$log->{tests_hit}});
		237	@goodtests = sort {$a <=> $b} map {$_->{index}} @goodtests;
254		238
255	$uniq_key .= join(" ",@good_tests);	239	if($max_hits_per_msg < scalar(@goodtests)) {
		240	$max_hits_per_msg = scalar(@goodtests);
		241	}
256		242
257	if (exists($count_keys{$uniq_key})) {	243	$uniq_key = $log->{isspam} ? "s" : "";
258	$count_keys{$uniq_key}++;	244	$uniq_key .= join(" ", @goodtests);
		245
		246
		247	# The %uniq_logs hash's entries will be the log info for each unique log
		248	# $log->{count} is increased to indicate similar logs
		249
		250	if (exists($uniq_logs{$uniq_key})) {
		251	$uniq_logs{$uniq_key}->{count}++;
259	} else {	252	} else {
260	$count_keys{$uniq_key} = 1;	253	$uniq_logs{$uniq_key} = $log;
261	$file_key{$file} = $uniq_key;	254	$uniq_logs{$uniq_key}->{count} = 1;
262	$uniq_files{$file} = scalar(keys(%count_keys)) - 1;
263	}	255	}
		256
264	}	257	}
265		258
266	my $num_nondup = scalar(keys(%uniq_files));	259	my $num_nondup = scalar(keys %uniq_logs);
267		260
268	open (TOP, ">tmp/tests.h");	261	open TOP, ">$opt_o/tests.h";
269	print TOP "	262	print TOP <<EOF;
270	#include <stdio.h>	263	#include <stdio.h>
271	#include <string.h>	264	#include <string.h>
272	#include <stdlib.h>	265	#include <stdlib.h>
Lines 274-280 Link Here
274	int num_tests = $num_tests;	267	int num_tests = $num_tests;
275	int num_nondup = $num_nondup;	268	int num_nondup = $num_nondup;
276	int num_spam = $num_spam;	269	int num_spam = $num_spam;
277	int num_nonspam = $num_nonspam;	270	int num_nonspam = $num_ham;
278	int max_hits_per_msg = $max_hits_per_msg;	271	int max_hits_per_msg = $max_hits_per_msg;
279	unsigned char num_tests_hit[$num_nondup];	272	unsigned char num_tests_hit[$num_nondup];
280	unsigned char is_spam[$num_nondup];	273	unsigned char is_spam[$num_nondup];
Lines 282-477 Link Here
282	double scores[$num_nondup];	275	double scores[$num_nondup];
283	double tmp_total[$num_nondup];	276	double tmp_total[$num_nondup];
284	int tests_count[$num_nondup];	277	int tests_count[$num_nondup];
		278	EOF
285		279
286	";	280
287	$_ = join ('', <DATA>);	281	print TOP join('', <DATA>);
288	print TOP $_;
289	close TOP;	282	close TOP;
290		283
291	open (DAT, ">tmp/tests.data");
292		284
293	foreach $file (sort {$a <=> $b} (keys %uniq_files)) {	285	open (DAT, ">$opt_o/tests.data");
294	print DAT ".".$uniq_files{$file}."\n";
295		286
296	my $out = '';	287	my $out;
297	$out .= "s".$is_spam{$file}."\n";	288	my $base_score;
		289	my $num_tests_hit;
298		290
299	my $base_score = 0;	291	$i = 0;
300	my $num_tests_hit = 0;	292	foreach my $log (values %uniq_logs) {
301	foreach my $test (@{$tests_hit{$file}}) {	293	$out = '';
302	if ($test eq '') { next; }	294	$base_score = $num_tests_hit = 0;
303		295
304	if ($ignored_rule{$test}) {	296	print DAT "." . $i . "\n";
305	warn "ignored rule $test got a hit in $file!\n";	297
306	next;	298	$out .= "s" . ( ($log->{isspam})? 1 : 0 ) . "\n";
		299
		300	foreach my $test (@{$log->{tests_hit}}) {
		301	if (!$test->{score}) {
		302	# Don't really know why this happens, but the old logs-to-c
		303	#did it too
		304
		305	warn "ignored rule " . $test->{name} . " got a hit!";
		306	next;
307	}	307	}
308		308
309	if (!defined $rule_to_index{$test}) {	309	if (!$test->{range_lo} && !$test->{range_hi}) {
310	warn "test with no C index: $test\n";	310	# We ignored this rule
311	next;	311	next;
312	}	312	}
313		313
314	if ($mutable_tests{$test}) {	314	# debugging...
315	$num_tests_hit++;	315	if (!defined $test->{index}) {
316	$out .= "t".$rule_to_index{$test}."\n";	316	warn "test with no index";
317
318	if ($num_tests_hit >= $max_hits_per_msg) {
319	die "Need to increase \$max_hits_per_msg";
320	}	317	}
321	} else {
322	$base_score += $scores{$test};
323	}
324	}
325		318
326	$out .= "b" . $base_score . "\n"; # score to add in for non-mutable tests	319	if ($test->{ismutable}) {
327	$out .= "c" . $count_keys{$file_key{$file}} . "\n";	320	$num_tests_hit++;
		321	$out .= "t".$test->{index}."\n";
328		322
329	print DAT "n".$num_tests_hit."\n".$out;	323	if ($num_tests_hit >= $max_hits_per_msg) {
330	}	324	die "\$max_hits_per_msg not big enough!";
331	close DAT;	325	}
332	}
333		326
334	sub read_ranges {	327	}
335	if (!-f 'tmp/ranges.data') {	328	else {
336	system ("make tmp/ranges.data");	329	$base_score += $test->{score};
337	}	330	}
338		331
339	# read ranges, and mutableness, from ranges.data.
340	open (IN, "<tmp/ranges.data")
341	or die "need to run score-ranges-from-freqs first!";
342
343	my $count = 0;
344	while (<IN>) {
345	/^(\S+) (\S+) (\d+) (\S+)$/ or next;
346	my $t = $4;
347	$range_lo{$t} = $1+0;
348	$range_hi{$t} = $2+0;
349	my $mut = $3+0;
350
351	if ($allrules{$t}->{issubrule}) {
352	$ignored_rule{$t} = 1;
353	$mutable_tests{$t} = 0;
354	next;
355	}	332	}
356	if (($range_lo{$t} == $range_hi{$t}) && (! $range_lo{$t})) {
357	#warn "ignored rule: score and range == 0: $t\n";
358	$ignored_rule{$t} = 1;
359	$mutable_tests{$t} = 0;
360	next;
361	}
362		333
363	$ignored_rule{$t} = 0;	334	$out .= "b" . $base_score . "\n"; # score to add for non-mutable tests
364	$index_to_rule[$count] = $t;	335	$out .= "c" . $log->{count} . "\n"; # number of identical logs
365	$count++;
366		336
367	if (!$mut) {	337	print DAT "n" . $num_tests_hit . "\n" . $out;
368	$mutable_tests{$t} = 0;
369	} elsif ($range_lo{$t} == $range_hi{$t}) {
370	$mutable_tests{$t} = 0;
371	} elsif ($allrules{$t}->{tflags} =~ m/\buserconf\b/i) {
372	$mutable_tests{$t} = 0;
373	} else {
374	$mutable_tests{$t} = 1;
375	}
376	unless ($mutable_tests{$t} \|\| $scores{$t}) {
377	$ignored_rule{$t} = 1;
378	}
379	}
380	close IN;
381		338
382	# catch up on the ones missed; seems to be userconf or 0-hitters mostly.	339	$i++;
383	foreach my $t (sort keys %allrules) {
384	next if (exists($range_lo{$t}));
385	if ($allrules{$t}->{issubrule}) {
386	$ignored_rule{$t} = 1;
387	$mutable_tests{$t} = 0;
388	next;
389	}
390	$ignored_rule{$t} = 0;
391	unless (exists($mutable_tests{$t}) &&
392	($allrules{$t}->{tflags} !~ m/\buserconf\b/i)) {
393	$mutable_tests{$t} = 0;
394	}
395	unless ($mutable_tests{$t} \|\| $scores{$t}) {
396	$ignored_rule{$t} = 1;
397	}
398	$index_to_rule[$count] = $t;
399	$count++;
400	}	340	}
401	foreach my $t (keys %range_lo) {
402	next if ($ignored_rule{$t});
403	if ($mutable_tests{$t}) {
404	if (($scores{$t} == 1) && ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
405	$scores{$t} = -1;
406	} elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/) &&
407	($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
408	$scores{$t} = -0.01;
409	}
410	if ($scores{$t} >= $range_hi{$t}) {
411	$scores{$t} = $range_hi{$t} - 0.001;
412	} elsif ($scores{$t} <= $range_lo{$t}) {
413	$scores{$t} = $range_lo{$t} + 0.001;
414	}
415	} else {
416	if ($allrules{$t}->{tflags} =~ m/\buserconf\b/i) {
417	next;
418	} elsif ($range_lo{$t} == $range_hi{$t}) {
419	$scores{$t} = $range_lo{$t};
420	next;
421	}
422	if (($scores{$t} == 1) && ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
423	$scores{$t} = -1;
424	} elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/) &&
425	($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
426	$scores{$t} = -0.01;
427	}
428	if ($scores{$t} > $range_hi{$t}) {
429	$scores{$t} = $range_hi{$t};
430	} elsif ($scores{$t} < $range_lo{$t}) {
431	$scores{$t} = $range_lo{$t};
432	}
433	}
434	}
435	}
436		341
437	sub evaluate {	342	close DAT;
438	my $fprate = ($ga_ny / $num_tests) * 100.0;
439	my $fnrate = ($ga_yn / $num_tests) * 100.0;
440		343
441	printf ("\n# SUMMARY for threshold %3.1f:\n", $threshold);
442	printf "# Correctly non-spam: %6d %4.2f%% (%4.2f%% of non-spam corpus)\n", $ga_nn,
443	($ga_nn / $num_tests) * 100.0, ($ga_nn / $num_nonspam) * 100.0;
444	printf "# Correctly spam: %6d %4.2f%% (%4.2f%% of spam corpus)\n" , $ga_yy,
445	($ga_yy / $num_tests) * 100.0, ($ga_yy / $num_spam) * 100.0;
446	printf "# False positives: %6d %4.2f%% (%4.2f%% of nonspam, %6.0f weighted)\n", $ga_ny,
447	$fprate, ($ga_ny / $num_nonspam) * 100.0, $nyscore*$nybias;
448	printf "# False negatives: %6d %4.2f%% (%4.2f%% of spam, %6.0f weighted)\n", $ga_yn,
449	$fnrate, ($ga_yn / $num_spam) * 100.0, $ynscore;
450		344
451	# convert to the TCR metrics used in the published lit
452	my $nspamspam = $ga_yy;
453	my $nspamlegit = $ga_yn;
454	my $nlegitspam = $ga_ny;
455	my $nlegitlegit = $ga_nn; # legit mail correctly passed (was mistakenly $ga_yn)
456	my $nlegit = $num_nonspam;
457	my $nspam = $num_spam;
458
459	my $werr = ($lambda * $nlegitspam + $nspamlegit)
460	/ ($lambda * $nlegit + $nspam);
461
462	my $werr_base = $nspam
463	/ ($lambda * $nlegit + $nspam);
464
465	$werr \|\|= 0.000001; # avoid / by 0
466	my $tcr = $werr_base / $werr;
467
468	my $sr = ($nspamspam / $nspam) * 100.0;
469	my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;
470	printf "# TCR: %3.6f SpamRecall: %3.3f%% SpamPrec: %3.3f%% FP: %3.2f%% FN: %3.2f%%\n", $tcr, $sr, $sp, $fprate, $fnrate;
471	}	345	}
472		346
473	__DATA__
474		347
		348	__DATA__
475	void loadtests (void) {	349	void loadtests (void) {
476	FILE *fin = fopen ("tmp/tests.data", "r");	350	FILE *fin = fopen ("tmp/tests.data", "r");
477	char buf[256];	351	char buf[256];
Lines 557-560 Link Here
557		431
558	printf ("Read scores for %d tests.\n", num_scores);	432	printf ("Read scores for %d tests.\n", num_scores);
559	}	433	}
560

Lines 7-15 Link Here

(-)masses/post-ga-analysis.pl (-27 / +11 lines)
7	my %scores;	7	my %scores;
8	my %rulehit;	8	my %rulehit;
9		9
10	open(SPAM, "<spam.log");	10	open(LOGS, "<masses.log");
11	open(NONSPAM, "<nonspam.log");	11	open(SCORES, "<perceptron.scores");
12	open(SCORES, "<newscores");
13		12
14	while(<SCORES>)	13	while(<SCORES>)
15	{	14	{
Lines 22-32 Link Here
22		21
23	close(SCORES);	22	close(SCORES);
24		23
25	while(<SPAM>)	24	while(<LOGS>)
26	{	25	{
27	next if /^#/;	26	next if /^#/;
28	/.\s+[-0-9]\s+[^\s]+\s+([^\s])(\s+?:(?:bayes\|time)=\S+)\s*?$/;	27	/(.)\s+.\s+[-0-9]\s+[^\s]+\s+([^\s])(\s+?:(?:bayes\|time)=\S+)\s*?$/;
29	my @rules=split /,/,$1;	28	my $class = $1;
		29	my @rules=split /,/,$2;
30	my $score = 0.0;	30	my $score = 0.0;
31	foreach $rule (@rules)	31	foreach $rule (@rules)
32	{	32	{
Lines 35-41 Link Here
35	$rulehit{$rule}++;	35	$rulehit{$rule}++;
36	}	36	}
37		37
38	if($score < 5)	38	if($class eq "s" && $score < 5)
39	{	39	{
40	foreach $rule (@rules)	40	foreach $rule (@rules)
41	{	41	{
Lines 44-70 Link Here
44	}	44	}
45	$nfn++;	45	$nfn++;
46	}	46	}
47	}	47	if($class eq "h" && $score >= 5)
48
49	close(SPAM);
50
51	while(<NONSPAM>)
52	{
53	next if /^#/;
54	/.\s+[-0-9]\s+[^\s]+\s+([^\s])\s*$/;
55	next unless defined($1);
56
57	my @rules=split /,/,$1;
58	my $score = 0.0;
59	foreach $rule (@rules)
60	{	48	{
61	next unless (defined ($scores{$rule}));
62	$score += $scores{$rule};
63	$rulehit{$rule}++;
64	}
65
66	if($score >= 5)
67	{
68	foreach $rule (@rules)	49	foreach $rule (@rules)
69	{	50	{
70	next unless (defined ($scores{$rule}));	51	next unless (defined ($scores{$rule}));
Lines 72-79 Link Here
72	}	53	}
73	$nfp++;	54	$nfp++;
74	}	55	}
		56
75	}	57	}
76		58
		59	close(LOGS);
		60
77	@fpk = sort { $falsepos{$b}/($rulehit{$b}\|\|0.0001) <=> $falsepos{$a}/($rulehit{$a}\|\|0.00001) } keys %falsepos;	61	@fpk = sort { $falsepos{$b}/($rulehit{$b}\|\|0.0001) <=> $falsepos{$a}/($rulehit{$a}\|\|0.00001) } keys %falsepos;
78		62
79	print "COMMON FALSE POSITIVES: ($nfp total)\n-----------------------\n\n";	63	print "COMMON FALSE POSITIVES: ($nfp total)\n-----------------------\n\n";

Line 0 Link Here

(-)masses/convert-old-logs-to-new (+15 lines)
		1	#!/bin/sh -e
		2
		3	cat spam.log \| perl -ne's/^Y/s s/; s/^\./s h/; print unless /^\#/;' \
		4	> spam.log.sorted
		5
		6	cat ham.log \| perl -ne's/^Y/h s/; s/^\./h h/; print unless /^\#/;' \
		7	> ham.log.sorted
		8
		9	# sort by time
		10
		11	echo \# SVN revision: > masses.log
		12
		13	sort --field-separator='=' -n -k2,2 --merge spam.log.sorted ham.log.sorted \
		14	>> masses.log
		15
0	+ *	16	+ *

Lines 1-251 Link Here

(-)masses/score-ranges-from-freqs (-251 lines)
1	#!/usr/bin/perl -w
2	#
3	# <@LICENSE>
4	# Copyright 2004 Apache Software Foundation
5	#
6	# Licensed under the Apache License, Version 2.0 (the "License");
7	# you may not use this file except in compliance with the License.
8	# You may obtain a copy of the License at
9	#
10	# http://www.apache.org/licenses/LICENSE-2.0
11	#
12	# Unless required by applicable law or agreed to in writing, software
13	# distributed under the License is distributed on an "AS IS" BASIS,
14	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15	# See the License for the specific language governing permissions and
16	# limitations under the License.
17	# </@LICENSE>
18
19	# (rough) graphic demo of this algorithm:
20	# 0.0 = -limit [......] 0 ........ limit
21	# 0.25 = -limit ..[..... 0 .]...... limit
22	# 0.5 = -limit ....[... 0 ...].... limit
23	# 0.75 = -limit ......[. 0 .....].. limit
24	# 1.0 = -limit ........ 0 [......] limit
25	my $sliding_window_limits = 4.8; # limits = [-$range, +$range]
26	my $sliding_window_size = 5.5; # scores have this range within limits
27
28	# 0.0 = -limit [......] 0 ........ limit
29	# 0.25 = -limit ....[... 0 ]....... limit
30	# 0.5 = -limit ......[. 0 .]...... limit (note: tighter)
31	# 0.75 = -limit .......[ 0 ...].... limit
32	# 1.0 = -limit ........ 0 [......] limit
33	my $shrinking_window_lower_base = 0.00;
34	my $shrinking_window_lower_range = 1.00; # *ratio, added to above
35	my $shrinking_window_size_base = 1.00;
36	my $shrinking_window_size_range = 1.00; # *ratio, added to above
37
38	my $use_sliding_window = 0;
39
40	my $argcffile = shift @ARGV;
41	my $scoreset = shift @ARGV;
42	$scoreset = 0 if ( !defined $scoreset );
43
44	if (defined ($argcffile) && $argcffile eq '-test') {
45	# use this to debug the ranking -> score-range mapping:
46	for $rat (0.0, 0.25, 0.5, 0.75, 1.0) {
47	my ($lo, $hi); if ($use_sliding_window) {
48	($lo, $hi) = sliding_window_ratio_to_range($rat);
49	} else {
50	($lo, $hi) = shrinking_window_ratio_to_range($rat);
51	}
52	warn "test: $rat => [ $lo $hi ]\n";
53	} exit;
54	}
55
56	my %freq_spam = ();
57	my %freq_nonspam = ();
58
59	my $num_spam;
60	my $num_nonspam;
61	my $num_total;
62
63	my %mutable_tests = ();
64	my %ranking = ();
65	my %soratio = ();
66	my %is_nice = ();
67
68	if (!defined $argcffile) { $argcffile = "../rules"; }
69	system ("./parse-rules-for-masses -d \"$argcffile\" -s $scoreset") and die;
70	if (-e "tmp/rules.pl") {
71	# Note, the spaces need to stay in front of the require to work around a RPM 4.1 problem
72	require "./tmp/rules.pl";
73	}
74	else {
75	die "parse-rules-for-masses had no error but no tmp/rules.pl!?!";
76	}
77
78	while (<>) {
79	/^\s([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+\S+\s+(.+)\s$/ or next;
80
81	my $overall = $1+0;
82	my $spam = $2+0;
83	my $nonspam = $3+0;
84	my $soratio = $4+0;
85	my $ranking = $5+0;
86	my $test = $6;
87
88	if ($test eq '(all messages)') {
89	$num_spam = $spam;
90	$num_nonspam = $nonspam;
91	$num_total = $spam+$nonspam;
92	next;
93	}
94	next if ($test eq '(all messages as %)');
95
96	if (!defined ($rules{$test})) {
97	warn "rule $test no longer exists; ignoring\n";
98	next;
99	}
100
101	$freq{$test} = $overall;
102	$freq_spam{$test} = $spam;
103	$freq_nonspam{$test} = $nonspam;
104
105	my $tflags = $rules{$test}->{tflags}; $tflags \|\|= '';
106	if ($tflags =~ /\buserconf\b/ \|\|
107	( ($scoreset % 2) == 0 && $tflags =~ /\bnet\b/ )) {
108	$mutable_tests{$test} = 0;
109	} else {
110	$mutable_tests{$test} = 1;
111	}
112	if ($tflags =~ m/\bnice\b/i) {
113	$is_nice{$test} = 1;
114	} else {
115	$is_nice{$test} = 0;
116	}
117
118	if ($overall < 0.01) { # less than 0.01% of messages were hit
119	$mutable_tests{$test} = 0;
120	$soratio{$test} = 0.5;
121	$ranking{$test} = 0.0;
122	$rules{$test}->{score} = 0; # tvd - disable these rules automagically
123
124	} else {
125	$soratio{$test} = $soratio;
126	$ranking{$test} = $ranking;
127	}
128	}
129
130	if ( ! mkdir "tmp", 0755 ) {
131	warn "Couldn't create tmp directory!: $!\n";
132	}
133
134	open (OUT, ">tmp/ranges.data");
135	foreach my $test (sort { $ranking{$b} <=> $ranking{$a} } keys %freq) {
136	if (!defined ($rules{$test})) {
137	warn "no rule $test";
138	print OUT ("0 0 0 $test\n");
139	next;
140	}
141
142	my $overall = $freq{$test};
143	my $spam = $freq_spam{$test};
144	my $nonspam = $freq_nonspam{$test};
145	my $soratio = $soratio{$test};
146	my $ranking = $ranking{$test};
147	my $mutable = $mutable_tests{$test};
148
149	if (!$mutable \|\| $rules{$test}->{score} == 0) { # didn't look for score 0 - tvd
150	printf OUT ("%3.3f %3.3f 0 $test\n",
151	$rules{$test}->{score},
152	$rules{$test}->{score});
153	next;
154	}
155
156	# 0.0 = best nice, 1.0 = best nonnice
157	if ($is_nice{$test}) {
158	$ranking = .5 - ($ranking / 2);
159	} else {
160	$ranking = .5 + ($ranking / 2);
161	}
162
163	my ($lo, $hi);
164	if ($use_sliding_window) {
165	($lo, $hi) = sliding_window_ratio_to_range($ranking);
166	} else {
167	($lo, $hi) = shrinking_window_ratio_to_range($ranking);
168	}
169
170	# tvd
171	my $tflags = $rules{$test}->{tflags}; $tflags \|\|= '';
172	if ( $is_nice{$test} && ( $ranking < .5 ) ) { # proper nice rule
173	if ( $tflags =~ /\blearn\b/ ) { # learn rules should get a higher score # -5.4
174	$lo *=1.8;
175	}
176	elsif ($soratio <= 0.05 && $nonspam > 0.5) { # let good rules be larger if they want to, -4.5
177	$lo *= 1.5;
178	}
179
180	$hi = ($soratio == 0) ? $lo :
181	($soratio <= 0.005 ) ? $lo/1.1 :
182	($soratio <= 0.010 && $nonspam > 0.2) ? $lo/2.0 :
183	($soratio <= 0.025 && $nonspam > 1.5) ? $lo/10.0 :
184	0;
185
186	if ( $soratio >= 0.35 ) { # auto-disable bad rules
187	($lo,$hi) = (0,0);
188	}
189	}
190	elsif ( !$is_nice{$test} && ( $ranking >= .5 ) ) { # proper spam rule
191	if ( $tflags =~ /\blearn\b/ ) { # learn rules should get a higher score
192	$hi *=1.8;
193	}
194	elsif ( $soratio >= 0.99 && $spam > 1.0 ) {
195	$hi *= 1.5; # let good rules be larger if they want to
196	}
197
198	$lo = ($soratio == 1) ? $hi:
199	($soratio >= 0.995 ) ? $hi/4.0 :
200	($soratio >= 0.990 && $spam > 1.0) ? $hi/8.0 :
201	($soratio >= 0.900 && $spam > 10.0) ? $hi/24.0 :
202	0;
203
204	if ( $soratio <= 0.65 ) { # auto-disable bad rules
205	($lo,$hi) = (0,0);
206	}
207	}
208	else { # rule that has bad nice setting
209	($lo,$hi) = (0,0);
210	}
211	$mutable = 0 if ( $hi == $lo );
212
213	printf OUT ("%3.1f %3.1f $mutable $test\n", $lo, $hi);
214	}
215	close OUT;
216	exit;
217
218	sub sliding_window_ratio_to_range {
219	my $ratio = shift;
220	my $lo = -$sliding_window_limits + ($sliding_window_size * $ratio);
221	my $hi = +$sliding_window_limits - ($sliding_window_size * (1-$ratio));
222	if ($lo > $hi) { # ???
223	($lo,$hi) = ($hi,$lo);
224	}
225	($lo, $hi);
226	}
227
228	sub shrinking_window_ratio_to_range {
229	my $ratio = shift;
230	my $is_nice = 0;
231	my $adjusted = ($ratio -.5) * 2; # adj [0,1] to [-1,1]
232	if ($adjusted < 0) { $is_nice = 1; $adjusted = -$adjusted; }
233
234	#$adjusted /= 1.5 if ( $ratio < 0.95 && $ratio > 0.15 ); # tvd
235
236	my $lower = $shrinking_window_lower_base
237	+ ($shrinking_window_lower_range * $adjusted);
238	my $range = $shrinking_window_size_base
239	+ ($shrinking_window_size_range * $adjusted);
240	my $lo = $lower;
241	my $hi = $lower + $range;
242	if ($is_nice) {
243	my $tmp = $hi; $hi = -$lo; $lo = -$tmp;
244	}
245	if ($lo > $hi) { # ???
246	($lo,$hi) = ($hi,$lo);
247	}
248
249	($lo, $hi);
250	}
251

Lines 17-54 Link Here

(-)masses/find-extremes (-153 / +182 lines)
17	# limitations under the License.	17	# limitations under the License.
18	# </@LICENSE>	18	# </@LICENSE>
19		19
20	use Getopt::Std;
21	getopts("l:L:h");
22		20
		21	use FindBin;
		22	use lib "$FindBin::Bin/../lib";
		23	use Mail::SpamAssassin::Masses;
		24	use Getopt::Long qw(:config bundling auto_help);
		25	use Pod::Usage;
		26	use strict;
		27	use warnings;
		28
23	use vars qw {	29	use vars qw {
24	$opt_h $opt_l $opt_L	30	$opt_c $opt_s $opt_l $opt_L $opt_inclang
25	};	31	};
26		32
27	sub usage {	33	GetOptions("c\|cffile=s@" => \$opt_c,
28	die "find-extremes [-l LC] [-L LC] [spam log] [nonspam log]	34	"s\|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
		35	"l\|logfile=s" => \$opt_l,
		36	"L\|language=s" => \$opt_L,
		37	"include-language=s" => \$opt_inclang);
29		38
30	-l LC also print language specific rules for lang code LC (or 'all')
31	-L LC only print language specific rules for lang code LC (or 'all')
32		39
33	options -l and -L are mutually exclusive.
34		40
35	if either the spam or and nonspam logs are unspecified, the defaults	41	my $lower = 1;
36	are \"spam.log\" and \"nonspam.log\" in the cwd.	42	#$threshold = 5;
		43	my $higher = 9;
		44	my $min_expected = 2; # Should not be set to more than 5 or less than 2
37		45
38	";	46
		47	=head1 NAME
		48
		49	find-extremes - Determine which rules are most likely to cause false positives/negatives.
		50
		51	=head1 SYNOPSIS
		52
		53	find-extremes [options]
		54
		55	Options:
		56	-c,--cffile=path Use path as the rules directory
		57	-s,--scoreset=n Use scoreset n
		58	-l,--logfile=file Read in file instead of masses.log
		59	-L,--language=lc Only print language specific tests for specified lang code (try 'all')
		60	--include-language=lc Also print language specific tests for specified lang code (try 'all')
		61
		62	=head1 DESCRIPTION
		63
		64	B<find-extremes> will read the mass-check log F<masses.log> or the
		65	log given by the B<--logfile> option. By default, B<find-extremes>
		66	will assume the proper values for B<--cffile> based on the header of
		67	the masses.log. The output will include the following columns:
		68
		69	=over 4
		70
		71	=item RULE
		72
		73	=item CHISQUARE
		74
		75	=item RATIO_FALSEPOS
		76
		77	=item OVER_FALSEPOS
		78
		79	=item FREQ_OVER
		80
		81	=back
		82
		83	=head1 BUGS
		84
		85	This script may or may not work as designed - it probably needs some
		86	tweaking, and I probably introduced a bug into it while re-writing for
		87	the new Masses stuff.
		88
		89	=head1 NOTES
		90
		91	This script is poorly documented. Patches welcome.
		92
		93	=cut
		94
		95
		96	$opt_s = 0 unless defined $opt_s;
		97
		98	my $ok_lang = lc ( $opt_inclang \|\| $opt_L \|\| '');
		99	$ok_lang = '.' if ($ok_lang eq 'all');
		100
		101	my $greprules = sub {
		102	my ($name, $rule) = @_;
		103
		104	return 0 if (($opt_L && !$rule->{lang}) \|\|
		105	($rule->{lang} &&
		106	(!$ok_lang \|\| $rule->{lang} !~ /^$ok_lang/i))); # Wrong language
		107
		108	return 0 if ($rule->{tflags} =~ /\bnet\b/);
		109
		110	return 1;
111
112	};
113
114	$opt_l \|\|= "masses.log";
115
116	if (!$opt_c \|\| !scalar(@$opt_c)) {
117	# Try to read this in from the log, if possible
118	open (IN, $opt_l) or die "Can't open $opt_l: $!";
119	my $files = 0; # are we in the files section?
120	while(<IN>) {
121	if (!$files) {
122	if (/^\# SVN revision:/) {
123	$opt_c = [ "$FindBin::Bin/../rules" ];
124	last;
125	} elsif (/^\# Using configuration:$/) {
126	$files = 1;
127	}
128	} elsif (/^\#\s+(.*)\s*$/) { # capture the whole path listed in the log header
129	push (@$opt_c, $1);
130	} else {
131	# All done!
132	last;
133	}
134	}
135
136	foreach my $file (@$opt_c) {
137	die "Can't read $file" unless -r $file;
138	}
39	}	139	}
40		140
41	usage() if($opt_h \|\| ($opt_l && $opt_L));	141	my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
		142	scoreset => $opt_s,
		143	greprules => $greprules,
		144	logfile => $opt_l,
		145	nologs => 1});
42		146
43	$lower = 1;	147	$masses->readrules();
44	#$threshold = 5;	148	$masses->readlogs();
45	$higher = 9;
46	$min_expected = 2; # Should not be set to more than 5 or less than 2
47		149
48	my %freq_spam = (); # how often non-nice found in spam	150	my $rules = $masses->get_rules_hash();
		151	my $logs = $masses->get_logs();
		152
		153	my $num_spam = $masses->get_num_spam();
		154	my $num_ham = $masses->get_num_ham();
		155
49	my %freq_over_higher_falsepos = (); # how often non-nice found in ones over	156	my %freq_over_higher_falsepos = (); # how often non-nice found in ones over
50	# higher threshold that are false positives	157	# higher threshold that are false positives
51	my %freq_nonspam = (); # how often nice found in nonspam
52	my %freq_under_lower_falseneg = (); # how often nice found in ones under	158	my %freq_under_lower_falseneg = (); # how often nice found in ones under
53	# lower threshold that are false negatives	159	# lower threshold that are false negatives
54		160
Lines 59-101 Link Here
59	my %ratio_expected_falsepos = (); # ratio version of above	165	my %ratio_expected_falsepos = (); # ratio version of above
60	my %ratio_expected_falseneg = (); # ditto	166	my %ratio_expected_falseneg = (); # ditto
61		167
62	my $num_spam = 0;
63	my $num_nonspam = 0;
64	my $num_over_higher_falsepos = 0;	168	my $num_over_higher_falsepos = 0;
65	my $num_under_lower_falseneg = 0;	169	my $num_under_lower_falseneg = 0;
66	my $ok_lang = '';
67		170
68	readscores();	171	my %chisquare = ( );
		172	my %prob = ( );
69		173
70	$ok_lang = lc ($opt_l \|\| $opt_L \|\| '');
71	if ($ok_lang eq 'all') { $ok_lang = '.'; }
72		174
73	foreach my $key (keys %rules) {	175	foreach my $key (keys %$rules) {
74		176
75	if ( ($opt_L && !$rules{$key}->{lang}) \|\|	177	if ($rules->{$key}->{tflags} !~ /\buserconf\b/) {
76	($rules{$key}->{lang} &&	178	if ($rules->{$key}->{tflags} =~ m/nice/) {
77	(!$ok_lang \|\| $rules{$key}->{lang} !~ /^$ok_lang/i)
78	) ) {
79	delete $rules{$key} ; next;
80	}
81
82	if ($rules{$key}->{tflags} =~ m/net/) {
83	delete $rules{$key};
84	next;
85	}
86	if ($rules{$key}->{tflags} !~ m/userconf/) {
87	if ($rules{$key}->{tflags} =~ m/nice/) {
88	$freq_nonspam{$key} = 0;
89	$freq_under_lower_falseneg{$key} = 0;	179	$freq_under_lower_falseneg{$key} = 0;
90	} else {	180	} else {
91	$freq_spam{$key} = 0;
92	$freq_over_higher_falsepos{$key} = 0;	181	$freq_over_higher_falsepos{$key} = 0;
93	}	182	}
94	}	183	}
		184
95	}	185	}
96		186
97	readlogs();	187	foreach my $log (@$logs) {
98		188
		189	if($log->{isspam}) { # spam message: candidate extreme false negative
		190	# Also need to count plus_hits
		191	my $plus_hits = 0;
		192	foreach my $test (@{$log->{tests_hit}}) {
		193	$plus_hits += $test->{score} if ($test->{score} > 0);
		194	}
		195
		196	if(($log->{score} <= $lower) && $plus_hits && $plus_hits >= $lower) {
		197	$num_under_lower_falseneg++; # one count per message
		198	foreach my $test (@{$log->{tests_hit}}) {
		199	# only the per-rule tally is bumped here; the message was already counted above
		200	$freq_under_lower_falseneg{$test->{name}}++ if exists $freq_under_lower_falseneg{$test->{name}};
		201	}
		202	}
		203	}
		204	else { # non-spam message: candidate extreme false positive
		205	if($log->{score} > $higher) {
		206	$num_over_higher_falsepos++; # one count per message
		207	foreach my $test (@{$log->{tests_hit}}) {
		208	# only the per-rule tally is bumped here; the message was already counted above
		209	$freq_over_higher_falsepos{$test->{name}}++ if exists $freq_over_higher_falsepos{$test->{name}};
		210	}
		211	}
		212	}
		213
		214	}
		215
99	unless (($num_over_higher_falsepos >= $min_expected)	216	unless (($num_over_higher_falsepos >= $min_expected)
100	&& ($num_under_lower_falseneg >= $min_expected)) {	217	&& ($num_under_lower_falseneg >= $min_expected)) {
101	die "Insufficient extremes in dataset (" . $num_over_higher_falsepos .	218	die "Insufficient extremes in dataset (" . $num_over_higher_falsepos .
Lines 119-130 Link Here
119	}	236	}
120		237
121	my $ratio_falsepos = $num_over_higher_falsepos/$num_spam;	238	my $ratio_falsepos = $num_over_higher_falsepos/$num_spam;
122	my $ratio_falseneg = $num_under_lower_falseneg/$num_nonspam;	239	my $ratio_falseneg = $num_under_lower_falseneg/$num_ham;
123		240
124	my $skipped_non_nice = 0;	241	my $skipped_non_nice = 0;
125		242
126	foreach $rule (keys %freq_spam) {	243	# non-nice rules
127	my $expected = $freq_spam{$rule}*$ratio_falsepos;	244	foreach my $rule (keys %freq_over_higher_falsepos) {
		245	my $expected = $rules->{$rule}->{freq_spam}*$ratio_falsepos;
128	if ($expected <= $min_expected) {	246	if ($expected <= $min_expected) {
129	$skipped_non_nice++;	247	$skipped_non_nice++;
130	next;	248	next;
Lines 136-142 Link Here
136	$freq_over_higher_falsepos{$rule}/$expected;	254	$freq_over_higher_falsepos{$rule}/$expected;
137	($chisquare{$rule},$prob{$rule}) =	255	($chisquare{$rule},$prob{$rule}) =
138	chisquare($num_spam,$num_over_higher_falsepos,	256	chisquare($num_spam,$num_over_higher_falsepos,
139	$freq_spam{$rule},$freq_over_higher_falsepos{$rule});	257	$rules->{$rule}->{freq_spam},$freq_over_higher_falsepos{$rule});
140	if ($freq_over_higher_falsepos{$rule} < $expected) {	258	if ($freq_over_higher_falsepos{$rule} < $expected) {
141	$chisquare{$rule} *= -1;	259	$chisquare{$rule} *= -1;
142	}	260	}
Lines 146-153 Link Here
146		264
147	my $skipped_nice = 0;	265	my $skipped_nice = 0;
148		266
149	foreach $rule (keys %freq_nonspam) {	267	# nice rules
150	my $expected = $freq_nonspam{$rule}*$ratio_falseneg;	268	foreach my $rule (keys %freq_under_lower_falseneg) {
		269	my $expected = $rules->{$rule}->{freq_ham}*$ratio_falseneg;
151	if ($expected <= $min_expected) {	270	if ($expected <= $min_expected) {
152	$skipped_nice++;	271	$skipped_nice++;
153	next;	272	next;
Lines 158-165 Link Here
158	$ratio_expected_falseneg{$rule} =	277	$ratio_expected_falseneg{$rule} =
159	$freq_under_lower_falseneg{$rule}/$expected;	278	$freq_under_lower_falseneg{$rule}/$expected;
160	($chisquare{$rule},$prob{$rule}) =	279	($chisquare{$rule},$prob{$rule}) =
161	chisquare($num_nonspam,$num_under_lower_falseneg,	280	chisquare($num_ham,$num_under_lower_falseneg,
162	$freq_nonspam{$rule},$freq_under_lower_falseneg{$rule});	281	$rules->{$rule}->{freq_ham},$freq_under_lower_falseneg{$rule});
163	if ($freq_under_lower_falseneg{$rule} < $expected) {	282	if ($freq_under_lower_falseneg{$rule} < $expected) {
164	$chisquare{$rule} *= -1;	283	$chisquare{$rule} *= -1;
165	}	284	}
Lines 167-174 Link Here
167		286
168	warn "Skipped nice: $skipped_nice\n";	287	warn "Skipped nice: $skipped_nice\n";
169		288
170	@rules_falsepos = grep {$prob{$_} < .5} (keys %over_expected_falsepos);	289	# The rest is copied verbatim from before - its complicated and not
		290	# commented and should work unchanged except for the freq_spam and
		291	# freq_ham stuff and fixing some use strict stuff
171		292
		293	my @rules_falsepos = grep {$prob{$_} < .5} (keys %over_expected_falsepos);
		294
172	if (scalar(@rules_falsepos)) {	295	if (scalar(@rules_falsepos)) {
173	print "RULE\t\tCHISQUARE\tRATIO_FALSEPOS\tOVER_FALSEPOS\tFREQ_OVER ($num_over_higher_falsepos)\n";	296	print "RULE\t\tCHISQUARE\tRATIO_FALSEPOS\tOVER_FALSEPOS\tFREQ_OVER ($num_over_higher_falsepos)\n";
174	my(@rules_falsepos_bad) =	297	my(@rules_falsepos_bad) =
Lines 183-189 Link Here
183	$over_expected_falsepos{$a}) \|\|	306	$over_expected_falsepos{$a}) \|\|
184	($freq_over_higher_falsepos{$b} <=>	307	($freq_over_higher_falsepos{$b} <=>
185	$freq_over_higher_falsepos{$a})} (@rules_falsepos_bad);	308	$freq_over_higher_falsepos{$a})} (@rules_falsepos_bad);
186	foreach $rule (@rules_falsepos_bad) {	309	foreach my $rule (@rules_falsepos_bad) {
187	print $rule . "\t" . $prob{$rule} . "\t" .	310	print $rule . "\t" . $prob{$rule} . "\t" .
188	$ratio_expected_falsepos{$rule} . "\t" .	311	$ratio_expected_falsepos{$rule} . "\t" .
189	$over_expected_falsepos{$rule} . "\t" .	312	$over_expected_falsepos{$rule} . "\t" .
Lines 199-207 Link Here
199	($chisquare{$a} <=> $chisquare{$b}) \|\|	322	($chisquare{$a} <=> $chisquare{$b}) \|\|
200	($ratio_expected_falsepos{$a} <=>	323	($ratio_expected_falsepos{$a} <=>
201	$ratio_expected_falsepos{$b}) \|\|	324	$ratio_expected_falsepos{$b}) \|\|
202	($freq_spam{$b} <=>	325	($rules->{$b}->{freq_spam} <=>
203	$freq_spam{$a})} (@rules_falsepos_good);	326	$rules->{$a}->{freq_spam})} (@rules_falsepos_good);
204	foreach $rule (@rules_falsepos_good) {	327	foreach my $rule (@rules_falsepos_good) {
205	print $rule . "\t" . $prob{$rule} . "\t" .	328	print $rule . "\t" . $prob{$rule} . "\t" .
206	$ratio_expected_falsepos{$rule} . "\t" .	329	$ratio_expected_falsepos{$rule} . "\t" .
207	$over_expected_falsepos{$rule} . "\t" .	330	$over_expected_falsepos{$rule} . "\t" .
Lines 212-218 Link Here
212	warn "No over-falsepos to print\n";	335	warn "No over-falsepos to print\n";
213	}	336	}
214		337
215	@rules_falseneg = grep {$prob{$_} < .5} (keys %over_expected_falseneg);	338	my @rules_falseneg = grep {$prob{$_} < .5} (keys %over_expected_falseneg);
216		339
217	if (scalar(@rules_falseneg)) {	340	if (scalar(@rules_falseneg)) {
218	print "RULE\t\tCHISQUARE\tRATIO_FALSENEG\tOVER_FALSENEG\tFREQ_UNDER ($num_under_lower_falseneg)\n";	341	print "RULE\t\tCHISQUARE\tRATIO_FALSENEG\tOVER_FALSENEG\tFREQ_UNDER ($num_under_lower_falseneg)\n";
Lines 228-234 Link Here
228	$over_expected_falseneg{$a}) \|\|	351	$over_expected_falseneg{$a}) \|\|
229	($freq_under_lower_falseneg{$b} <=>	352	($freq_under_lower_falseneg{$b} <=>
230	$freq_under_lower_falseneg{$a})} (@rules_falseneg_bad);	353	$freq_under_lower_falseneg{$a})} (@rules_falseneg_bad);
231	foreach $rule (@rules_falseneg_bad) {	354	foreach my $rule (@rules_falseneg_bad) {
232	print $rule . "\t" . $prob{$rule} . "\t" .	355	print $rule . "\t" . $prob{$rule} . "\t" .
233	$ratio_expected_falseneg{$rule} . "\t" .	356	$ratio_expected_falseneg{$rule} . "\t" .
234	$over_expected_falseneg{$rule} . "\t" .	357	$over_expected_falseneg{$rule} . "\t" .
Lines 244-252 Link Here
244	($chisquare{$a} <=> $chisquare{$b}) \|\|	367	($chisquare{$a} <=> $chisquare{$b}) \|\|
245	($ratio_expected_falseneg{$a} <=>	368	($ratio_expected_falseneg{$a} <=>
246	$ratio_expected_falseneg{$b}) \|\|	369	$ratio_expected_falseneg{$b}) \|\|
247	($freq_spam{$b} <=>	370	($rules->{$b}->{freq_ham} <=>
248	$freq_spam{$a})} (@rules_falseneg_good);	371	$rules->{$a}->{freq_ham})} (@rules_falseneg_good);
249	foreach $rule (@rules_falseneg_good) {	372	foreach my $rule (@rules_falseneg_good) {
250	print $rule . "\t" . $prob{$rule} . "\t" .	373	print $rule . "\t" . $prob{$rule} . "\t" .
251	$ratio_expected_falseneg{$rule} . "\t" .	374	$ratio_expected_falseneg{$rule} . "\t" .
252	$over_expected_falseneg{$rule} . "\t" .	375	$over_expected_falseneg{$rule} . "\t" .
Lines 258-354 Link Here
258	}	381	}
259		382
260	exit;	383	exit;
261
262	sub readlogs {
263	my $spam = $ARGV[0] \|\| "spam.log";
264	my $nonspam = $ARGV[1] \|\| (-f "good.log" ? "good.log" : "nonspam.log");
265
266
267	(open(NONSPAM,$nonspam)) \|\|
268	(die "Couldn't open file '$nonspam': $!; stopped");
269
270	while (defined($line = <NONSPAM>)) {
271	if ($line =~ m/^\s*\#/) {
272	next;
273	} elsif ($line =~ m/^.\s+-?\d+\s+\S+\s(\S)/) {
274	my $tests = $1;
275	my $hits = 0;
276	my(@tests) = ();
277	foreach $test (grep {length($_)} (split(/,+/,$tests))) {
278	if (exists($rules{$test})) {
279	push @tests, $test;
280	$hits += $rules{$test}->{score};
281	}
282	}
283
284	if (scalar(@tests)) {
285	$num_nonspam++;
286	foreach $test (grep {exists($freq_nonspam{$_})} (@tests)) {
287	$freq_nonspam{$test}++;
288	}
289	if ($hits >= $higher) {
290	$num_over_higher_falsepos++;
291	foreach $test (grep
292	{exists($freq_over_higher_falsepos{$_})} (@tests)) {
293	$freq_over_higher_falsepos{$test}++;
294	}
295	}
296	}
297	} elsif ($line =~ m/\S/) {
298	chomp($line);
299	warn "Can't interpret line '$line'; skipping";
300	}
301	}
302
303	close(NONSPAM);
304
305	(open(SPAM,$spam)) \|\| (die "Couldn't open file '$spam': $!; stopped");
306
307	while (defined($line = <SPAM>)) {
308	if ($line =~ m/^\s*\#/) {
309	next;
310	} elsif ($line =~ m/^.\s+-?\d+\s+\S+\s(\S)/) {
311	my $tests = $1;
312	my $hits = 0;
313	my $plus_hits = 0;
314	my(@tests) = ();
315	foreach $test (grep {length($_)} (split(/,+/,$tests))) {
316	if (exists($rules{$test})) {
317	push @tests, $test;
318	$hits += $rules{$test}->{score};
319	if ($rules{$test}->{score} > 0) {
320	$plus_hits += $rules{$test}->{score};
321	}
322	}
323	}
324
325	if (scalar(@tests)) {
326	$num_spam++;
327	foreach $test (grep {exists($freq_spam{$_})} (@tests)) {
328	$freq_spam{$test}++;
329	}
330	if (($hits <= $lower) && $plus_hits &&
331	($plus_hits >= $lower)) {
332	$num_under_lower_falseneg++;
333	foreach $test (grep
334	{exists($freq_under_lower_falseneg{$_})} (@tests)) {
335	$freq_under_lower_falseneg{$test}++;
336	}
337	}
338	}
339	} elsif ($line =~ m/\S/) {
340	chomp($line);
341	warn "Can't interpret line '$line'; skipping";
342	}
343	}
344
345	close(SPAM);
346	}
347
348
349	sub readscores {
350	system ("./parse-rules-for-masses") and
351	die "Couldn't do parse-rules-for-masses: $?; stopped";
352	require "./tmp/rules.pl";
353	}
354

Lines 6-17 Link Here

(-)masses/tenpass/10pass-compute-tcr (-4 / +4 lines)
6	do	6	do
7	mkdir tmp/10passrules > /dev/null 2>&1	7	mkdir tmp/10passrules > /dev/null 2>&1
8	cp ../rules/[0-9]*.cf tmp/10passrules	8	cp ../rules/[0-9]*.cf tmp/10passrules
9	./rewrite-cf-with-new-scores $SCORESET ../rules/50_scores.cf \	9	./rewrite-cf-with-new-scores -s $SCORESET --old=../rules/50_scores.cf \
10	tenpass_results/scores.$run > tmp/10passrules/50_scores.cf	10	--new=tenpass_results/scores.$run --out=tmp/10passrules/50_scores.cf \
		11	--cffile=../rules
11		12
12	./fp-fn-statistics --cffile=tmp/10passrules \	13	./fp-fn-statistics --cffile=tmp/10passrules \
13	--spam=tenpass_results/spam.log.$run \	14	--logfile=tenpass_results/masses.log.$run > tmp/stats
14	--nonspam=tenpass_results/ham.log.$run > tmp/stats
15		15
16	grep TCR: tmp/stats	16	grep TCR: tmp/stats
17	done	17	done

Lines 1-13 Link Here

(-)masses/tenpass/10pass-run (-17 / +13 lines)
1	#!/bin/sh	1	#!/bin/sh
2		2
3	# change these!	3	# change these!
4	NSBASE=ham-logs	4	BASE=logs/
5	SPBASE=spam-logs
6	SCORESET="0"
7		5
8	passes="1 2 3 4 5 6 7 8 9 10"	6	passes="1 2 3 4 5 6 7 8 9 10"
9	mkdir -p tenpass_results	7	mkdir tenpass_results
10	mkdir -p ORIG
11		8
12	> make.output	9	> make.output
13		10
Lines 17-44 Link Here
17	echo "Training for corpus $id..."	14	echo "Training for corpus $id..."
18	pwd; date	15	pwd; date
19		16
20	> ORIG/ham-set$SCORESET.log	17	> masses.log
21	> ORIG/spam-set$SCORESET.log
22
23	echo -n "(using corpora blocks: "	18	echo -n "(using corpora blocks: "
24	for notid in $passes ; do	19	for notid in $passes ; do
25	if [ "$notid" != "$id" ] ; then	20	if [ "$notid" != "$id" ] ; then
26	echo -n "$notid "	21	echo -n "$notid "
27	cat $NSBASE/split-$notid.log >> ORIG/ham-set$SCORESET.log	22	cat $BASE/split-$notid.log >> masses.log
28	cat $SPBASE/split-$notid.log >> ORIG/spam-set$SCORESET.log
29	fi	23	fi
30	done	24	done
31	echo "for training)"	25	echo "for training)"
32		26
33	make clean >> make.output	27	make clean >> make.output
34	make >> make.output 2>&1	28	make perceptron >> make.output 2>&1
35	./runGA	29	./perceptron
36	pwd	30	pwd; date
37	date
38		31
39	echo "Saving test data for corpus $id..."	32	echo "Saving test data for corpus $id..."
40		33
41	cp $NSBASE/split-$id.log tenpass_results/ham.log.$id	34	cp $BASE/split-$id.log tenpass_results/masses.log.$id
42	cp $SPBASE/split-$id.log tenpass_results/spam.log.$id	35
43	cp gen-set$SCORESET.scores tenpass_results/scores.$id	36	cp perceptron.scores tenpass_results/scores.$id
		37
44	done	38	done
		39
		40