Attachment #1960 for bug #2853

View | Details | Raw Unified | Return to bug 2853
Collapse All | Expand All




# <@LICENSE>
# Copyright 2004 Apache Software Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

=head1 NAME

Mail::SpamAssassin::Masses - Interface for reading and parsing rules
and mass-check logs for SpamAssassin

=head1 SYNOPSIS

  my $parser = Mail::SpamAssassin::Masses->new();
  my $rules = $parser->readrules();
  my $logs = $parser->readlogs();

  foreach my $test (keys %$rules) {
    if ($rules->{$test}->{score} > 1) {
      ...
    }

=head1 DESCRIPTION

Mail::SpamAssassin::Masses is a module to simplify the many scripts
that used to make up the SpamAssassin re-scoring process. By
consolidating all the shared code in one module, the scripts can be
simplified and require fewer temporary files.

=head1 METHODS

=over 4

=cut

package Mail::SpamAssassin::Masses;

use strict;
use warnings;
use Carp;

=item $parser = Mail::SpamAssassin::Masses->new( [ { opt => val, ... } ] );

Construct a new Mail::SpamAssassin::Masses object. You may pass the
following attribute-value pairs to the constructor.

=over 4

=item rulesdir

The directory containing rules. If multiple directories are desired,
an anonymous array should be passed.

=item scoreset

Scoreset to deal with.

=item logfile

Filename of mass-check log.

=item falses

Also count frequencies for false positives and false negatives from
the logs.

=item falsesonly

Only count false positives and false negatives.

=item greprule

Coderef that is passed a rule name and a hash ref with the entries
containing info about the rule. If the sub returns false, it is skipped.

=item greplog

Coderef that is passed a raw log entry. If it returns false, the entry
is skipped.

=item sliding_window

Use a sliding window for score ranges rather than a shrinking window.

=item nologs

Save memory by not saving the individual log results, just the
aggregate totals

=back

=cut

sub new {

  my $class = shift;
  $class = ref($class) || $class;

  my $self = shift;
  if (!defined $self){
    $self = { };
  }

  $self->{scoreset} ||= 0;
  $self->{rulesdir} ||= '';
  $self->{logfile} ||= "masses.log";

  bless($self, $class);

  return $self;

}

=item $parser->readrules()

Read and parse the rules from the directory specified as
C<rulesdir>. This loads the following keys and values into the hash
entry representing the rules (see below).

=over 4

=item name

Contains the rule's name.

=item score

Contains the rule's score.

=item type

Contains the rule's type (header, body, uri, etc.)

=item tflags

Contains the rules tflags (nice, autolearn, etc.) as specified in the config file.

=item lang

Set to the value of C<lang> for language-specific tests.

=item issubrule

Set to true if the rules is a sub-rule, (i.e. it starts with
__). Otherwise, undefined.

=item isnice

This key exists and is true if the rule is nice (i.e. with a score
that can be below zero).

=item describe

Set to the rule's description, in English, or in the rule's language.

=back

There may be more values once C<readlogs()> is run.

=cut


sub readrules {

  my $self = shift;

  $self->{rules} ||= { };
  my $rules = $self->{rules}; # $rules is a reference to the anon hash

  my @dirs = ref($self->{rulesdir}) ? @{$self->{rulesdir}} : $self->{rulesdir};

  foreach my $indir (@dirs) {
    my @files = glob("$indir/*.cf"); # no reason to only do numbered files

    foreach my $file (@files) {
      open (IN, "<$file") || croak("Can't open $file, $!");
      while(<IN>) {
        s/#.*$//g;
        s/^\s+//;
        s/\s+$//;
        next if /^$/;

        my $lang = '';
        if (s/^lang\s+(\S+)\s+//) {
          $lang = lc $1;
        }

        if (/^(header|rawbody|body|full|uri|meta)\s+(\S+)\s+/) {
          my $type = $1;
          my $name = $2;

          $rules->{$name} ||= { };
	  $rules->{$name}->{name} = $name;
          $rules->{$name}->{type} = $type;
          $rules->{$name}->{lang} = $lang if $lang;
          $rules->{$name}->{tflags} = '';

          if ($name =~ /^__/) {
	    $rules->{$name}->{issubrule} = '1';
	  }

        } elsif (/^describe\s+(\S+)\s+(.+)$/) {

          # Let's get description in english, por favor -- unless the rule isn't english

	  next if ($lang && (!$rules->{$1}->{lang} || $rules->{$1}->{lang} ne $lang));

          $rules->{$1} ||= { };
          $rules->{$1}->{describe} = $2;

        } elsif (/^tflags\s+(\S+)\s+(.+)$/) {
	  my $name = $1;
          $rules->{$name} ||= { };
          $rules->{$name}->{tflags} = $2;
	  if ($2 =~ /nice/) {
	    $rules->{$name}->{isnice} = 1;
	  }
        } elsif (/^score\s+(\S+)\s+(.+)$/) {
          my($name,$score) = ($1,$2);
          $rules->{$name} ||= { };
          if ( $score =~ /\s/ ) { # there are multiple scores
            ($score) = (split(/\s+/,$score))[$self->{scoreset}];
          }
          $rules->{$name}->{score} = $score;
        }
      }
      close IN;
    }
  }
  foreach my $rule (keys %{$rules}) {
    if (!defined $rules->{$rule}->{type}) {
      delete $rules->{$rule};   # no rule definition -> no rule
      next;
    }

    if (!defined $rules->{$rule}->{score}) {
      my $def = 1.0;
      if ($rule =~ /^T_/) { $def = 0.01; }

      if ($rules->{$rule}->{isnice}) {
        $rules->{$rule}->{score} = -$def;
      } else {
        $rules->{$rule}->{score} = $def;
      }
    }

    if ($self->{greprules} && !&{$self->{greprules}}($rule, $rules->{$rule}))
    {
      delete $rules->{$rule};
      next;
    }

  }

  $self->{_readrules} = 1;
}

=item $parser->readlogs()

Read and parse logs from C<logsdir>. This will create the anonymous
array of hashes referred to by C<$parser->{logs}>, with the following
keys:

=over 4

=item isspam

True if the message is spam. False or undefined otherwise.

=item isfalse

True if the message was a false negative or positive.

=item tests_hit

Array reference containing references to the hash representing each
rule hit.

=item score

Score the message received (under current scores).

=back

In addition, this method adds the following keys to the rule
information in C<$parser->{rules}>.

=over 4

=item freq_spam

Frequency hit in spam.

=item freq_ham

Frequency hit in ham.

=item freq_fp

Frequency in false positives.

=item freq_fn

Frequency in false negatives.

=back

Also, sets C<$parser->{num_spam}> and C<$parser->{num_ham}> to the number of
spam logs read and the number of ham logs read, respectively.

=cut

sub readlogs {

  my $self = shift;

  if (!$self->{_readrules}) {
    # need to read scores first!
    $self->readrules();
  }

  my $rules = $self->{rules}; # copy the ref, shorthand

  my $logs;
  if (! $self->{nologs}) {
    $self->{logs} ||= [ ];
    $logs = $self->{logs};
  }


  my ($num_spam, $num_ham, $count, $num_fp, $num_fn);
  $num_spam = $num_ham = $count = $num_fp = $num_fn = 0;

  # First, initialize stuff
  foreach my $rule (values %{$self->{rules}}) {
    $rule->{freq_spam} ||= 0;
    $rule->{freq_ham} ||= 0;

    if($self->{falses}) {
      $rule->{freq_fp} ||= 0;
      $rule->{freq_fn} ||= 0;
    }

  }

  my $file = $self->{logfile};
  open (IN, "<$file");

  while (<IN>) {
    next if /^\#/;
    next if /^$/;
    if($_ !~ /^(.)\s+(.)\s+-?[\d.]+\s+\S+(\s+\S+\s+)/) { warn "bad line: $_"; next; }

    if ($self->{greplogs} && !&{$self->{greplogs}}($_)) {
      next;
    }

    my $manual = $1;
    my $result = $2;
    $_ = $3;
    s/(?:bayes|time)=\S+//;
    s/,,+/,/g;
    s/^\s+//;
    s/\s+$//;


    if ($manual ne $result) {
      $self->{isfalse} = 1;
    }
    elsif ($self->{falsesonly}) {
      next;
    }

    if ($manual eq "s") {
      $num_spam++;
      $logs->[$count]->{isspam} = 1 unless $self->{nologs};
      $num_fn++ if $result eq "h";
    } else {
      $num_ham++;
      $num_fp++ if $result eq "s";
    }

    my @tests = ();
    my $score = 0;
    foreach my $tst (split (/,/, $_)) {
      next if ($tst eq '');

      # Don't count non-existant rules
      # (Could happen with greprules)
      next if ( !$rules->{$tst} || !$rules->{$tst}->{type} );

      if ($manual eq "s") {
	  $rules->{$tst}->{freq_spam}++;
	  $rules->{$tst}->{freq_fn}++ if ($self->{falses} && $result eq "h");
      }
      else {
	  $rules->{$tst}->{freq_ham}++;
	  $rules->{$tst}->{freq_fp}++ if ($self->{falses} && $result eq "s");
      }

      $score += $rules->{$tst}->{score};

      push (@tests, $rules->{$tst}) unless $self->{nologs};
    }

    $logs->[$count]->{tests_hit} = \@tests unless $self->{nologs};
    $logs->[$count]->{score} = $score;

    $count++;
  }
  close IN;

  $self->{num_spam} = $num_spam;
  $self->{num_ham} = $num_ham;
  if ($self->{falses}) {
    $self->{num_fn} = $num_fn;
    $self->{num_fp} = $num_fp;
  }

  $self->{_readlogs} = 1; # Done reading logs

}

=item $parser->do_statistics();

Calculate the S/O ratio and the rank for each test.

This adds the following keys to the rules hashes.

=over 4

=item spam_percent

Percentage of spam messages hit.

=item ham_percent

Percentage of ham messages hit.

=item soratio

S/O ratio -- percentage of spam messages hit divided by total
percentage of messages hit.

=back

=cut

sub do_statistics {
  my $self = shift;

  if (! $self->{_readlogs} ) {
    $self->readlogs();
  }

  my $rank_hi=0;
  my $rank_lo=999999;

  foreach my $rule (values %{$self->{rules}}) {

    if (!$rule->{freq_spam}) {
      $rule->{spam_percent} = 0;
    } else {
      $rule->{spam_percent} = $rule->{freq_spam} / $self->{num_spam} * 100.0;
    }

    if (!$rule->{freq_ham}) {
      $rule->{ham_percent} = 0;
    } else {
      $rule->{ham_percent} = $rule->{freq_ham} / $self->{num_ham} * 100.0;
    }

    if (!$rule->{freq_spam} && !$rule->{freq_ham}) {
      $rule->{soratio} = 0.5;
      next;
    }

    $rule->{soratio} = $rule->{spam_percent} / ($rule->{spam_percent} + $rule->{ham_percent});

  }

  $self->{_statistics} = 1;

}

=item $parser->do_rank();

Calculates the ranking for each rule and stores this in the
appropriate key.

=over 4

=item rank

"Rank" of the rule. High numbers are good, low are bad.

=back

=cut

sub do_rank {

  my $self = shift;

  if (! $self->{_statistics} ) {
    $self->do_statistics();
  }

  my $rank_hi = 0;
  my $rank_lo = 9999999;

  my %unwanted;
  my %wanted;
  my %wranks;
  my %uranks;
  my $rules = $self->{rules};


  foreach my $rule (values %{$self->{rules}}) {

    $wanted{$rule->{name}} = $rule->{isnice} ? $rule->{freq_ham} : $rule->{freq_spam};
    $unwanted{$rule->{name}} = $rule->{isnice} ? $rule->{freq_spam} : $rule->{freq_ham};

    $wranks{$wanted{$rule->{name}}} = 1;
    $uranks{$unwanted{$rule->{name}}} = 1;

  }

  my @wanted = sort { $wanted{$a} <=> $wanted{$b} } keys %wanted;
  my @unwanted = sort { $unwanted{$b} <=> $unwanted{$a} } keys %unwanted;

  # first half of ranking is the wanted rank
  my $position = 0;
  my $last = undef;

  foreach my $test (@wanted) {
    $position++ if defined $last && $last != $wanted{$test};
    $rules->{$test}->{rank} += $position;
    $last = $wanted{$test};
  }

  # second half is the unwanted rank
  $position = 0;
  $last = undef;
  my $normalize = (scalar keys %wranks) / (scalar keys %uranks);

  foreach my $test (@unwanted) {
    $position++ if defined $last && $last != $unwanted{$test};
    $rules->{$test}->{rank} += ($position * $normalize);
    $last = $unwanted{$test};
    $rank_hi = $rules->{$test}->{rank} if ($rules->{$test}->{rank} > $rank_hi);
    $rank_lo = $rules->{$test}->{rank} if ($rules->{$test}->{rank} < $rank_lo);
  }

  $rank_hi = $rank_hi - $rank_lo;
  foreach my $rule (values %{$rules}) {
    $rule->{rank} = ($rank_hi == 0) ? 0.001 : (($rule->{rank} - $rank_lo)/ $rank_hi);
  }

  $self->{_rank} = 1;
}

=item $parser->get_rules_array();

Returns a reference to an array of hash references. The values of
these hash have keys as listed above.

=cut

sub get_rules_array {
  my $self = shift;
  return [ values %{$self->{rules}} ];
}

=item $parser->get_rules_hash();

Returns a reference to a hash with rule names as keys and hash
references as values. The values of these hash have keys as listed
above.

=cut

sub get_rules_hash {
  my $self = shift;
  return $self->{rules};
}

=item $parser->get_logs();

Returns a reference to the array containing log entries, in the form
of anonymous hashes with keys as described above.

=cut

sub get_logs {
  my $self = shift;
  return $self->{logs};
}

=item $parser->get_num_ham();

Returns number of ham logs read.

=cut

sub get_num_ham {
  my $self = shift;
  return $self->{num_ham};
}

=item $parser->get_num_spam();

Returns number of spam logs read.

=cut

sub get_num_spam {
  my $self = shift;
  return $self->{num_spam};
}

=item $parser->do_score_ranges();

Figure out range in which score can be set based on the soratio, etc.

This is necessary so that the perceptron doesn't set silly
scores. (This may not be as much of a problem as it was with the old
GA.)

This adds the following keys to the rules hashes:

=over 4

=item ismutable

Determines whether the perceptron can select a score for this test.

=item range_lo

Determines the lowest score the perceptron can set.

=item range_hi

Determines the highest score the perceptron can set.

=cut

sub do_score_ranges() {

  my $self = shift;

  if ( !$self->{_statistics} ) {
    $self->do_statistics();
  }
  if ( !$self->{_rank} ) {
    $self->do_rank();
  }

  foreach my $rule (values %{$self->{rules}}) {

    my ($rank, $lo, $hi);

    $rank = $rule->{rank};

    # Get rid of rules that don't hit -- and disable completely.
    if ($rule->{spam_percent} + $rule->{ham_percent} < 0.01 ||
	$rule->{score} == 0) {

      $rule->{ismutable} = 0;
      $rule->{range_lo} = $rule->{range_hi} = 0;
      next;

    }

    # next: get rid of tests that don't apply in this scoreset
    # or are userconf -- set ismutable to 0, but keep the score
    if ($rule->{tflags} =~ /\buserconf\b/ ||
	(($self->{scoreset} % 2) == 0 && $rule->{tflags} =~/\bnet\b/)) {

      $rule->{ismutable} = 0;
      $rule->{range_lo} = $rule->{range_hi} = $rule->{score};
      next;

    }


    # Normal rules:

    # This seems to convert from [-1,1] to [0,1] but we're already in
    # [0,1] space - Is this right?

    # The current way ranks are calculated, > 0.5 and < 0.5 have no
    # special meaning

#      # 0.0 = best nice, 1.0 = best nonnice
#      if ($rule->{isnice}) {
#        $rank = .5 - ($rank / 2);
#      } else {
#        $rank = .5 + ($rank / 2);
#      }

    # ranks are really meant to be used as human readable - not as
    # tools to choose score range - use soratio instead!

    if ($self->{use_sliding_window}) {
      ($lo, $hi) = $self->sliding_window_ratio_to_range($rule->{soratio});
    } else {
      ($lo, $hi) = $self->shrinking_window_ratio_to_range($rule->{soratio});
    }

    # Modify good rules to be lower
    if ($rule->{isnice}) {
      if ($rule->{tflags} =~ /\blearn\b/) { # learn rules should get
                                            # higher scores (-5.4)
	$lo *= 1.8;
      }
      elsif ( $rule->{soratio} <= 0.05 && $rule->{ham_percent} > 0.5) {
	$lo *= 1.5;
      }

      # argh, ugly... but i'm copying it whole...
      $hi =	($rule->{soratio} == 0) ? $lo :
    		($rule->{soratio} <= 0.005 ) ? $lo/1.1 :
    		($rule->{soratio} <= 0.010 && $rule->{ham_percent} > 0.2) ? $lo/2.0 :
		($rule->{soratio} <= 0.025 && $rule->{ham_percent} > 1.5) ? $lo/10.0 :
		0;

      if ($rule->{soratio} >= 0.35 ) {
	($lo, $hi) = (0,0);
      }
    }
    else { # Make non-nice rules have higher scores if they're good
      if ($rule->{tflags} =~ /\blearn\b/ ) {
	$hi *= 1.8;
      }
      elsif ( $rule->{soratio} >= 0.99 && $rule->{spam_percent} > 1.0) {
	$hi *= 1.5;
      }

      $lo =	($rule->{soratio} == 1) ? $hi:
    		($rule->{soratio} >= 0.995 ) ? $hi/4.0 :
    		($rule->{soratio} >= 0.990 && $rule->{spam_percent} > 1.0) ? $hi/8.0 :
		($rule->{soratio} >= 0.900 && $rule->{spam_percent} > 10.0) ? $hi/24.0 :
		0;

      if ($rule->{soratio} <= 0.65 ) { # auto-disable bad rules
	($lo, $hi) = (0,0);
      }
    }


    # Some sanity checking
    if($hi < $lo) {
      ($lo, $hi) = ($hi, $lo);
    }


    $rule->{ismutable} = ($lo == $hi) ? 0 : 1;
    $rule->{range_lo} = $lo;
    $rule->{range_hi} = $hi;

  }
}

sub sliding_window_ratio_to_range {
  my ($self, $ratio) = @_;

  # (rough) graphic demo of this algorithm:
  # 0.0  = -limit [......] 0 ........ limit
  # 0.25 = -limit ..[..... 0 .]...... limit
  # 0.5  = -limit ....[... 0 ...].... limit
  # 0.75 = -limit ......[. 0 .....].. limit
  # 1.0  = -limit ........ 0 [......] limit
  my $sliding_window_limits = 4.8; # limits = [-$range, +$range]
  my $sliding_window_size =   5.5; # scores have this range within limits

  my $lo = -$sliding_window_limits + ($sliding_window_size * $ratio);
  my $hi = +$sliding_window_limits - ($sliding_window_size * (1-$ratio));
  if ($lo > $hi) {
    ($lo,$hi) = ($hi,$lo);
  }
  ($lo, $hi);

}
sub shrinking_window_ratio_to_range {
  my ($self, $ratio) = @_;


  # 0.0  = -limit [......] 0 ........ limit
  # 0.25 = -limit ....[... 0 ]....... limit
  # 0.5  = -limit ......[. 0 .]...... limit (note: tighter)
  # 0.75 = -limit .......[ 0 ...].... limit
  # 1.0  = -limit ........ 0 [......] limit
  my $shrinking_window_lower_base =   0.00; 
  my $shrinking_window_lower_range =  1.00; # *ratio, added to above
  my $shrinking_window_size_base =    1.00;
  my $shrinking_window_size_range =   1.00; # *ratio, added to above

  my $is_nice = 0;
  my $adjusted = ($ratio -.5) * 2;      # adj [0,1] to [-1,1]
  if ($adjusted < 0) { $is_nice = 1; $adjusted = -$adjusted; }

#$adjusted /= 1.5 if ( $ratio < 0.95 && $ratio > 0.15 ); # tvd

  my $lower = $shrinking_window_lower_base 
                        + ($shrinking_window_lower_range * $adjusted);
  my $range = $shrinking_window_size_base 
                        + ($shrinking_window_size_range * $adjusted);
  my $lo = $lower;
  my $hi = $lower + $range;
  if ($is_nice) {
    my $tmp = $hi; $hi = -$lo; $lo = -$tmp;
  }
  if ($lo > $hi) {
    ($lo,$hi) = ($hi,$lo);
  }

  ($lo, $hi);
}


# Pacify perl
1;




# limitations under the License.
# </@LICENSE>


use FindBin;
use lib "$FindBin::Bin/../lib";
use Mail::SpamAssassin::Masses;
use Getopt::Long qw(:config bundling auto_help);
use Pod::Usage;
use strict;
use warnings;


use vars qw {
  $opt_f $opt_m $opt_M $opt_X $opt_p $opt_x $opt_h $opt_l $opt_L $opt_c
  $opt_a $opt_t $opt_s $opt_z $opt_inclang $opt_auto
};

GetOptions("c|cffile=s@" => \$opt_c,
	   "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
	   "l|logfile=s" => \$opt_l,
	   "f|falses" => \$opt_f,
	   "a|all" => \$opt_a,
	   "p|percentages" => \$opt_p,
	   "x|extended" => \$opt_x,
	   "m|matchrule=s" => \$opt_m, #,
	   "t|tflags=s" => \$opt_t,
	   "M|matchlog=s" => \$opt_M,
	   "X|excludelog=s" => \$opt_X,
	   "L|language=s" => \$opt_L,
	   "include-language=s" => \$opt_inclang);

    -c p   use p as the rules directory
    -f     falses. count only false-negative or false-positive matches
    -m RE  print rules matching regular expression
    -t RE  print rules with tflags matching regular expression
    -M RE  only consider log entries matching regular expression
    -X RE  don't consider log entries matching regular expression
    -l LC  also print language specific rules for lang code LC (or 'all')
    -L LC  only print language specific rules for lang code LC (or 'all')
    -a     display all tests
    -p     percentages. implies -x
    -x     extended output, with S/O ratio and scores
    -s SC  which scoreset to use
    -i     use IG (information gain) for ranking

=head1 NAME

hit-frequencies - Display statistics about tests hit by a mass-check run

=head1 SYNOPSIS
    are \"spam.log\" and \"ham.log\" in the cwd.

hit-frequencies [options]
}

 Options:
    -c,--cffile=path	  Use path as the rules directory
    -s,--scoreset=n	  Use scoreset n
    -l,--logfile=file	  Read in file instead of masses.log
    -f			  Count only false-positives/false-negatives
    -a			  Report all tests (including subrules)
    -p			  Report percentages instead of raw hits
    -x			  "Extended" output, include RANK, S/O and SCORE
    -m,--matchrule=re     Print rules matching the regular expression
    -t,--tflags=re	  Print only rules with tflags matching the regular expression
    -M,--matchlog=re      Consider only logs matching the regular expression
    -X,--excludelog=re	  Exclude logs matching this regular expression
    -L,--language=lc	  Only print language specific tests for specified lang code (try 'all')
    --include-language=lc Also print language specific tests for specified lang code (try 'all')

=head1 DESCRIPTION
  $opt_x = 1;
}

B<hit-frequencies> will read the mass-check log F<masses.log> or the
log given by the B<--logfile> option. The output will contain a
summary of the number of ham and spam messages and detailed statistics
for each rule. By default, B<hit-frequencies> will try to guess the
proper values for B<--cffile> based on the header of the
masses.log. The output will include the following columns:

=over 4

=item OVERALL
my %freq_ham = ();
my $num_spam = 0;
my $num_ham = 0;
my %ranking = ();
my $ok_lang = '';

Number of times (or percentage with B<-p>) the rule hit on
all messages (spam or ham).

=item SPAM
if ($ok_lang eq 'all') { $ok_lang = '.'; }

Number of times (or percentage with B<-p>) the rule hit on
spam messages.

=item HAM
       ($rules{$key}->{lang} &&
         (!$ok_lang || $rules{$key}->{lang} !~ /^$ok_lang/i)
     ) ) {
    delete $rules{$key} ; next;
  }

Number of times (or percentage with B<-p>) the rule hit on
ham messages.
}

=item S/O

Shown only with B<-x> or B<-p>, this is the number of spam hits
divided by total number of hits (C<S/O> refers to spam divided by
overall).

=item RANK
  my $sorting = $opt_i ? "IG" : "RANK";
  if ($opt_f) {
    printf "%7s %7s %7s  %6s  %6s  %6s  %s\n",
  	"OVERALL%", "FNEG%", "FPOS%", "S/O", $sorting, "SCORE", "NAME";
  } else {
    printf "%7s %7s  %7s  %6s  %6s  %6s  %s\n",
  	"OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME";
  }
  printf "%7d  %7d  %7d  %7.3f %6.2f  %6.2f  (all messages)\n",
  	$hdr_all, $hdr_spam, $hdr_ham,
        soratio ($num_spam,$num_ham), 0, 0;

Shown only with B<-x> or B<-p>, this is a measure that attempts to
indicate how I<good> or I<useful> a test is. The higher it is, the
better the test.
  printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  (all messages as %%)\n",
  	$hdr_all, $hdr_spam, $hdr_ham,
        soratio ($num_spam,$num_ham), 0, 0;

=item SCORE
  printf "%7s %7s  %7s  %6s  %6s %6s  %s\n",
  	"OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME";
  printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  (all messages)\n",
  	$hdr_all, $hdr_spam, $hdr_ham,
        soratio ($num_spam,$num_ham), 0, 0;

Shown only with B<-x> or B<-p>, this is the current score assigned to
the rule.
  	"OVERALL", "SPAM", "HAM", "NAME";
  printf "%10d  %10d  %10d  (all messages)\n",
  	$hdr_all, $hdr_spam, $hdr_ham;
}

=item NAME
my @tests = ();
my $rank_hi = 0;
my $rank_lo = 9999999;

This is the rule's name.
my %wanted;
my %unwanted;
my %wranks;
my %uranks;

=back
  next unless (exists $rules{$test});           # only valid tests
  next if (!$opt_a && $rules{$test}->{issubrule});

=head1 BUGS
  push (@tests, $test);

Please report bugs to http://bugzilla.spamassassin.org/
  if ($rules{$test}->{tflags} =~ /nice/) { $isnice = 1; }

=head1 SEE ALSO
  my $fn = $freq_ham{$test}; $fn ||= 0;
  my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0;
  my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0;

L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>

=cut
    $soratio = 1.0 - $soratio;
    my $tmp = $fsadj; $fsadj = $fnadj; $fnadj = $tmp;
  }

if ($opt_L && $opt_inclang) {
  pod2usage("-L/--language and --include-language are mutually exclusive");
    my $rank;

    # New new system: from "Learning to Filter Unsolicited Commercial E-Mail",
    # Ion Androutsopoulos et al: determine the information gain IG(X, C) of the
    # Boolean attributes (ie. the rules). Measures "the average reduction in
    # the entropy of C (classification) given the value of X (the rule)". Makes
    # a good ranking measure with a proper statistical basis. ;)
    #
    # Still would like to get an entropy measure in, too.
    #
    #             sum                                    P(X = x ^ C = c)
    # IG(X,C) = x in [0, 1]    P(X = x ^ C = c) . log2( ------------------- )
    #           c in [Ch, Cs]                           P(X = x) . P(C = c)
    #
    my $safe_nspam = $num_spam || 0.0000001;
    my $safe_nham = $num_ham || 0.0000001;

    my $num_all = ($num_spam + $num_ham);
    my $safe_all = $num_all || 0.0000001;
    my $f_all = $fs+$fn;

    my $px0 = (($num_all - $f_all) / $safe_all);         # P(X = 0)
    my $px1 = ($f_all / $safe_all);                      # P(X = 1)
    my $pccs = ($num_spam / $safe_all);                  # P(C = Cs)
    my $pcch = ($num_ham / $safe_all);                   # P(C = Ch)
    my $px1ccs = ($fs / $safe_nspam);                   # P(X = 1 ^ C = Cs)
    my $px1cch = ($fn / $safe_nham);                    # P(X = 1 ^ C = Ch)
    my $px0ccs = (($num_spam - $fs) / $safe_nspam);     # P(X = 0 ^ C = Cs)
    my $px0cch = (($num_ham - $fn) / $safe_nham);       # P(X = 0 ^ C = Ch)
    my $safe_px0_dot_pccs = ($px0 * $pccs) || 0.00000001;
    my $safe_px0_dot_pcch = ($px0 * $pcch) || 0.00000001;
    my $safe_px1_dot_pccs = ($px1 * $pccs) || 0.00000001;
    my $safe_px1_dot_pcch = ($px1 * $pcch) || 0.00000001;

    sub log2 { return log($_[0]) / 0.693147180559945; } # log(2) = 0.6931...

    my $safe_px0ccs = ($px0ccs || 0.0000001);
    my $safe_px0cch = ($px0cch || 0.0000001);
    my $safe_px1ccs = ($px1ccs || 0.0000001);
    my $safe_px1cch = ($px1cch || 0.0000001);
    $rank = ( $px0ccs * log2($safe_px0ccs / $safe_px0_dot_pccs) ) +
                    ( $px0cch * log2($safe_px0cch / $safe_px0_dot_pcch) ) +
                    ( $px1ccs * log2($safe_px1ccs / $safe_px1_dot_pccs) ) +
                    ( $px1cch * log2($safe_px1cch / $safe_px1_dot_pcch) );

    $ranking{$test} = $rank;
    $rank_hi = $rank if ($rank > $rank_hi);
    $rank_lo = $rank if ($rank < $rank_lo);
  }
  else {
    # basic wanted/unwanted ranking
    $wanted{$test} = $isnice ? $fn : $fs;
    $unwanted{$test} = $isnice ? $fs : $fn;
    # count number of ranks of each type
    $wranks{$wanted{$test}} = 1;
    $uranks{$unwanted{$test}} = 1;
  }
}

if ($opt_p) {
  $opt_x = 1;
  my @wanted = sort { $wanted{$a} <=> $wanted{$b} } keys %wanted;
  my @unwanted = sort { $unwanted{$b} <=> $unwanted{$a} } keys %unwanted;

  # first half of ranking is the wanted rank
  my $position = 0;
  my $last = undef;
  for my $test (@wanted) {
    $position++ if defined $last && $last != $wanted{$test};
    $ranking{$test} += $position;
    $last = $wanted{$test}
  }

  # second half of ranking is the unwanted rank
  my $normalize = (scalar keys %wranks) / (scalar keys %uranks);
  $position = 0;
  $last = undef;
  for my $test (@unwanted) {
    $position++ if defined $last && $last != $unwanted{$test};
    $ranking{$test} += ($position * $normalize);
    $last = $unwanted{$test};
    $rank_hi = $ranking{$test} if ($ranking{$test} > $rank_hi);
    $rank_lo = $ranking{$test} if ($ranking{$test} < $rank_lo);
  }
}

$opt_s = 0 if ( !defined $opt_s );
  # now normalise the rankings to [0, 1]
  $rank_hi -= $rank_lo;
  foreach $test (@tests) {
    $ranking{$test} = $rank_hi == 0 ? 0.001 : ($ranking{$test} - $rank_lo) / ($rank_hi);
  }
}

my $ok_lang = lc ( $opt_inclang || $opt_L || '');
$ok_lang = '.' if ($ok_lang eq 'all');
  next if (!$opt_a && $rules{$test}->{issubrule});

my $greprules = sub { # To determine whether rule should be read
  my ($name, $rule) = @_;
  my $fa = $fs+$fn;

  return 0 if ($opt_m && $name !~ /$opt_m/); # name doesn't match -m
                                             # expression
  return 0 if ($opt_t && $rule->{tflags} !~ /$opt_t/); # tflags don't
                                                       # match -t
                                                       # expression
  return 0 if (($opt_L && !$rule->{lang}) ||
	   ($rule->{lang} &&
	    (!$ok_lang || $rule->{lang} !~ /^$ok_lang/i))); # Wrong language

  return 0 if ($rule->{issubrule} && !$opt_a);

  if (!$opt_a && !$opt_t) {
    return 0 if ($rule->{tflags} =~ /net/ && ($opt_s % 2 == 0));
    return 0 if ($rule->{tflags} =~ /userconf/); # or userconf
  }
  return 1;

};
  my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0;
  my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0;

  if ($opt_f && $fsadj == 0 && $fnadj == 0) { next; }

my $logfile = $opt_l || "masses.log";
    $fa = ($fa / ($num_spam + $num_ham)) * 100.0;
    $fs = $fsadj;
    $fn = $fnadj;
  }

if (!$opt_c || !scalar(@$opt_c)) {
    # Try to read this in from the log, if possible
    open IN, $logfile or die "Can't open $logfile: $!";
    my $files = 0; # are we in the files section?
    while(<IN>) {
	if (!$files) {
	    if (/^\# SVN revision:/) {
		$opt_c = [ "$FindBin::Bin/../rules" ];
		last;
	    } elsif (/^\# Using configuration:$/) {
		$files = 1;
	    }
	} elsif (/^\#\s+(.*)\s*$/) {
	    push (@$opt_c, $1);
	} else {
	    # All done!
	    last;
	}
    }

    foreach my $file (@$opt_c) {
	die "Can't read $file" unless -r $file;
    }

  } elsif ($opt_x) {
    printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  %s\n",
  	$fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}, $test;

  } else {
    printf "%10d  %10d  %10d  %s\n", $fa, $fs, $fn, $test;
  }
}
	    
my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
					       scoreset => $opt_s,
                                               falsesonly => $opt_f,
                                               greprules => $greprules,
                                               logfile => $logfile,
                                               nologs => 1});

$masses->readrules();
$masses->readlogs();
$masses->do_statistics();
$masses->do_rank();

my $rules = $masses->get_rules_hash();
my $num_ham = $masses->get_num_ham();
my $num_spam = $masses->get_num_spam();
my $num_all = $num_ham + $num_spam;

if ($num_ham + $num_spam <= 0) {
  die "Can't run hit-frequencies on 0 messages.";
}

## Write header
    open (IN, "<$file") || die "Could not open file '$file': $!";

if ($opt_p) {

  if ($opt_f) {
    printf "%7s %7s %7s  %6s  %6s  %6s  %s\n",
  	"OVERALL%", "FNEG%", "FPOS%", "S/O", "RANK", "SCORE", "NAME";
  } else {
    printf "%7s %7s  %7s  %6s  %6s  %6s  %s\n",
  	"OVERALL%", "SPAM%", "HAM%", "S/O", "RANK", "SCORE", "NAME";
  }

  printf "%7d  %7d  %7d  %7.3f %6.2f  %6.2f  (all messages)\n",
  	$num_all, $num_spam, $num_ham,
        $num_spam / $num_all, 0, 0;
      $_ = $4; s/,,+/,/g;

  printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  (all messages as %%)\n",
  	100.0, $num_spam / $num_all * 100.0, $num_ham / $num_all * 100.0,
        $num_spam / $num_all, 0, 0;
        } else {
          $num_spam++;
        }
      } else {
        if ($opt_f) {
          if ($caught) { $num_ham++; }
        } else {
          $num_ham++;
        }
      }

} elsif ($opt_x) {
  printf "%7s %7s  %7s  %6s  %6s %6s  %s\n",
  	"OVERALL", "SPAM", "HAM", "S/O", "RANK", "SCORE", "NAME";
  printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  (all messages)\n",
  	$num_all, $num_spam, $num_ham,
        $num_spam / $num_all, 0, 0;
          } else {
            $freq_spam{$t}++;
          }
	} else {
          if ($opt_f) {
            if ($caught) { $freq_ham{$t}++; }
          } else {
            $freq_ham{$t}++;
          }
	}
      }
    }
    close IN;
  }
}

} else {
  printf "%10s  %10s  %10s  %s\n",
  	"OVERALL", "SPAM", "HAM", "NAME";
  printf "%10d  %10d  %10d  (all messages)\n",
  	$num_all, $num_spam, $num_ham;
}

foreach my $test (sort { $rules->{$b}->{rank} <=> $rules->{$a}->{rank} } keys %{$rules}) {
  my ($s, $n) = @_;

  if ($opt_p) {
    printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  %s\n",
  	($rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham}) / $num_all * 100.0,
        $rules->{$test}->{spam_percent}, $rules->{$test}->{ham_percent},
        $rules->{$test}->{soratio}, $rules->{$test}->{rank}, $rules->{$test}->{score}, $test;
  } elsif ($opt_x) {
    printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  %s\n",
  	$rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham},
        $rules->{$test}->{freq_spam}, $rules->{$test}->{freq_ham},
        $rules->{$test}->{soratio}, $rules->{$test}->{rank}, $rules->{$test}->{score}, $test;
  } else {
    printf "%10d  %10d  %10d  %s\n",
        $rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham},
        $rules->{$test}->{freq_spam}, $rules->{$test}->{freq_ham}, $test;
  }
}

sub tcr {
  my ($nspam, $nlegit, $nspamspam, $nlegitspam) = @_;
  my $nspamlegit = $nspam - $nspamspam;
  my $nlegitlegit = $nlegit - $nlegitspam;

  my $lambda = 99;

  my $werr = ($lambda * $nlegitspam + $nspamlegit)
                  / ($lambda * $nlegit + $nspam);

  my $werr_base = $nspam
                  / ($lambda * $nlegit + $nspam);

  $werr ||= 0.000001;     # avoid / by 0
  my $tcr = $werr_base / $werr;
  return $tcr;
}




# limitations under the License.
# </@LICENSE>

=head1 NAME

rewrite-cf-with-new-scores - Rewrite SpamAssassin scores file with new
scores.

=head1 SYNOPSIS

rewrite-cf-with-new-scores [options]

  Options
  --old-scores=file    Read file containing the old SpamAssassin scores
  --new-scores=file    Read file containing the new SpamAssassin scores
  -s,--scoreset n      Rewrite scoreset n
  --output=file        Output rewritten score file to file
  -c,--cffile=path     Use path as the rules directory
  -l,--logfile=file    Use file instead of masses.log (for guessing -c)

 Note: these options can be shortened (i.e. --old, --new, --out) as
 long as they are unambiguous.

=head1 DESCRIPTION

B<rewrite-cf-with-new-scores> is a tool to update the sitewide scores
file with the newly generated scores. Since SpamAssassin has four
different scoresets, which each need to be generated separately, this
tool is used to only change the correct scoreset.

By default, the old scores are read from 50_scores.cf in the rules
directory and the new ones from ./perceptron.scores. The output will
be ./50_scores.cf by default.

The rules directory needs to be used to make sure scores are given for
the right tests. Rules not found in the rules directory will not be
given scores in the output.

=head1 BUGS

Please report bugs to http://bugzilla.spamassassin.org/

=head1 SEE ALSO

L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>

=cut

use FindBin;
use lib "$FindBin::Bin/../lib";
use Getopt::Long qw(:config bundling auto_help);
use Mail::SpamAssassin::Masses;
use Pod::Usage;
use strict;
use warnings;

use vars qw($opt_old $opt_new $opt_scoreset $opt_out $opt_c $opt_l);

GetOptions("old-scores=s" => \$opt_old,
	   "new-scores=s" => \$opt_new,
	   "s|scoreset=i" => \$opt_scoreset,
	   "output=s" => \$opt_out,
	   "c|cffile=s@" => \$opt_c,
	   "l|logfile=s" => \$opt_l);

$opt_l ||= "masses.log";
$opt_scoreset = 0 unless defined $opt_scoreset;

my $NUM_SCORESETS = 4;

if (!$opt_c || !scalar(@$opt_c)) {
    # Try to read this in from the log, if possible
    open IN, $opt_l or die "Can't open $opt_l: $!";
    my $files = 0; # are we in the files section?
    while(<IN>) {
	if (!$files) {
	    if (/^\# SVN revision:/) {
		$opt_c = [ "$FindBin::Bin/../rules" ];
		last;
	    } elsif (/^\# Using configuration:$/) {
		$files = 1;
	    }
	} elsif (/^\#\s+(.*)\s*$/) {
	    push (@$opt_c, $1);
	} else {
	    # All done!
	    last;
	}
    }

    foreach my $file (@$opt_c) {
	die "Can't read $file" unless -r $file;
    }
}

if (!$opt_old) {
  $opt_old = $$opt_c[0] . "/50_scores.cf";
  # Note, the spaces need to stay in front of the require to work around a RPM 4.1 problem
  require "./tmp/rules.pl";
}
else {
  die "parse-rules-for-masses had no error but no tmp/rules.pl!?!";
}

$opt_new ||= "50_scores.cf";

my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
					       scoreset => $opt_scoreset});

$masses->readrules();
my $rules = $masses->get_rules_hash();

# now read the generated scores
my @gascoreorder = ();
my %oldscores = ();
my %gascorelines = ();
open (STDIN, "<$opt_new") or die "cannot open $opt_new";
while (<STDIN>) {
  /^score\s+(\S+)\s+(-?\d+(?:\.\d+)?)/ or next;
  my $name = $1;  my $score = $2;
  next unless (exists ($rules->{$name}) && !$rules->{$name}->{issubrule});
  next if ($name =~ /^__/);
  next if ($name eq '(null)');	# er, oops ;)


  push (@gascoreorder, $name);
}

open (IN, "<$opt_old") or die "cannot open $opt_old";
my $out = '';
my $pre = '';


while (<IN>) {
  if (/^\s*score\s+(\S+)\s/) {
    delete $gascorelines{$1};
    next unless (exists ($rules->{$1}) && $rules->{$1}->{issubrule} == 0);
  }
  $pre .= $_;
  /^# Start of generated scores/ and last;

  if (/^\s*score\s+\S+/) {
    my($score,$name,@scores) = split;

    next unless (exists ($rules->{$name}) && !$rules->{$name}->{issubrule});
    if (defined $gascorelines{$name}) {
      # Set appropriate scoreset value
      $scores[$opt_scoreset] = $gascorelines{$name};

      # Create new score line
      $_ = join(" ","score",$name,generate_scores(@scores))."\n";

}
close IN;

open OUT, ">$opt_out" or die "Can't open $opt_out: $!";

# and output the lot
print OUT $pre, "\n";
foreach my $name (@gascoreorder) {
  $_ = $gascorelines{$name};
  next unless (defined ($_));

  @scores = @{$oldscores{$name}} if ( exists $oldscores{$name} );

  # Set appropriate scoreset value
  $scores[$opt_scoreset] = $_;

  # Create new score line
  print OUT join(" ","score",$name,generate_scores(@scores)),"\n";
}
print OUT "\n", $out, "\n";

sub generate_scores {
  my (@scores) = @_;





# What rule scoreset are we using?
SCORESET =	0
LOGFILE =	masses.log

#### Should be no need to modify below this line

all: badrules perceptron

perceptron: perceptron.o
	$(CC) -o perceptron perceptron.o $(LDFLAGS) 

perceptron.o: tmp/tests.h
	$(CC) $(CFLAGS) -c -o perceptron.o perceptron.c

tmp/tests.h: tmp/.created logs-to-c
	perl logs-to-c --scoreset=$(SCORESET) --logfile=$(LOGFILE)

freqs: masses.log
	perl hit-frequencies -x -p -s $(SCORESET) --logfile=$(LOGFILE) > freqs

tmp/scores.h: tmp/tests.h

tmp/ranges.data: tmp/.created freqs score-ranges-from-freqs
	perl score-ranges-from-freqs ../rules $(SCORESET) < freqs

freqs: spam.log ham.log
	perl hit-frequencies -x -p -s $(SCORESET) > freqs

badrules: freqs
	perl lint-rules-from-freqs -s $(SCORESET) --logfile=$(LOGFILE) > badrules

tmp/.created:
	-mkdir tmp




# limitations under the License.
# </@LICENSE>

=head1 NAME

lint-rules-from-freqs - Try to find problems with SpamAssassin rules

=head1 SYNOPSIS

lint-rules-from-freqs [options]

 Options:
    -c,--cffile=path	  Use path as the rules directory
    -s,--scoreset=n	  Use scoreset n
    -l,--logfile=file	  Read in file instead of masses.log
    -f			  Also take into account false positives/negatives

=head1 DESCRIPTION

This script analyzes SpamAssassin tests, based on the hit frequencies
and S/O ratios from a mass-check log (masses.log).  This script can
also optionally take into account the false positive/negative
frequencies.

The script first uses the SpamAssassin rules parser to report on any
illegal syntax. Then it checks the rules match frequencies from the
mass-check log in order to determine how effective the rule is.

=head1 BUGS

Please report bugs to http://bugzilla.spamassassin.org/

=head1 SEE ALSO

L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>

=cut


use FindBin;
use lib "$FindBin::Bin/../lib";
use Mail::SpamAssassin::Masses;
use Mail::SpamAssassin;
use Getopt::Long qw(:config bundling auto_help);
use strict;
use warnings;

# any tests that get less than this % of matches on *both* spam or nonspam, are
# reported.
my $LOW_MATCHES_PERCENT = 0.03;
my $scoreset = 0;

use vars qw($opt_c $opt_l $opt_s $opt_f $opt_p);

GetOptions("c|cffile=s@" => \$opt_c,
	   "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
	   "l|logfile=s" => \$opt_l,
	   "f|falses" => \$opt_f);


$opt_s = 0 unless defined $opt_s;
$opt_l ||= "masses.log";

sub usage {
  die "
lint-rules-from-freqs: perform 'lint' testing on SpamAssassin rules and scores

usage: ./lint-rules-from-freqs [-c rules dir] [-l logfile] [-s scoreset] [-f]

 -c p  Use p as rules directory
 -l f  Use f as log fine
 -s n  Use n as score set
 -f    Check also for false positives/negatives

The 'freqs' argument is the frequency of hits in all messages ('hit-frequencies
-x -p' output).

The 'falsefreqs' argument is frequencies of hits in false-positives and
false-negatives only ('hit-frequencies -x -p -f' output).

";
}


if (!$opt_c || !scalar(@$opt_c)) {
    # Try to read this in from the log, if possible
    open IN, $opt_l or die "Can't open $opt_l: $!";
    my $files = 0; # are we in the files section?
    while(<IN>) {
	if (!$files) {
	    if (/^\# SVN revision:/) {
		$opt_c = [ "$FindBin::Bin/../rules" ];
		last;
	    } elsif (/^\# Using configuration:$/) {
		$files = 1;
	    }
	} elsif (/^\#\s+(.*)\s*$/) {
	    push (@$opt_c, $1);
	} else {
	    # All done!
	    last;
	}
    }

    foreach my $file (@$opt_c) {
	die "Can't read $file" unless -r $file;
    }
}

print "BAD TESTS REPORT\n";
# First, do a --lint

print "\nRule file syntax issues:\n\n";
lintrules();

{
  local (*STDERR) = \*STDOUT; # Get lint errors on STDOUT

  # Read the config ourselves...

  # Read init.pre from each directory, then glob for the rest.

  my $cf_txt = '';
  my @files;
  my @dirs;
  foreach my $file (@$opt_c) {
    if (-d $file) {
      if  (-r "$file/init.pre") {
	push @files, "$file/init.pre";
      }
      push @dirs, $file;
    }
    else {
      push @files, $file;
    }
  }
  foreach my $dir (@dirs) {
    my @cfs = glob("$dir/*.cf");
    push @files, grep { -r $_ } @cfs;
  }

  foreach my $file (@files) {
    if (-r $file) {
      open IN, $file;
      $cf_txt .= "file start $file\n";
      $cf_txt .= join('', <IN>);
      $cf_txt .= "\nfile end $file\n";
      close IN;
    }
  }

  my $spamtest = new Mail::SpamAssassin({config_text => $cf_txt});

  $spamtest->lint_rules();
}


# Next, check for other stuff
my $masses = Mail::SpamAssassin::Masses->new({rulesdir => $opt_c,
					      scoreset => $opt_s, #,,
					      falses => $opt_f,
					      logfile => $opt_l});

$masses->readlogs();
$masses->do_statistics();

my $rules = $masses->get_rules_array();


my %output;

foreach my $rule (@$rules) {

  my $badrule;
  my ($overall, $spam, $nons, $so, $score, $name) = split (' ');
  next unless ($name =~ /\S/);

  next if ($rule->{tflags} =~ /\bnet\b/ && ($opt_s % 2) == 0);
  next if ($rule->{tflags} =~ /\buserconf\b/);
  my $ffso = $falsefreqs_so{$name};

  if ($rule->{freq_spam} == 0 && $rule->{freq_ham} == 0) {        # sanity!
  next if ($tf =~ /net/ && ($scoreset % 2) == 0);
  next if ($tf =~ /userconf/);

  if ($overall == 0.0 && $spam == 0.0 && $nons == 0.0) {        # sanity!
    $badrule = 'no matches';

  } else {
    if ($rule->{score} < 0.0) {
      # negative score with more spams than nonspams? bad rule.
      if (!$rule->{isnice} && $rule->{soratio} > 0.5 && $rule->{score} < 0.5) {
        $badrule = 'non-nice but -ve score';
      }
      if ($rule->{isnice} && $rule->{soratio} > 0.5 && $rule->{score} < 0.5) {
        if ($opt_f && $rule->{freq_fn} < $rule->{freq_fp}) {
        if ($ffso < 0.5) {
          $badrule = 'fn';
        } else {
          # ignore, the FNs are overridden by other tests so it doesn't
          # affect the overall results.
        }
        # else {
        # ignore, the FNs are overridden by other tests so it doesn't
        # affect the overall results.
        # }
      }

      # low number of matches overall
      if ($rule->{ham_percent} < $LOW_MATCHES_PERCENT)
                 { $badrule ||= ''; $badrule .= ', low matches'; }

    } elsif ($rule->{score} > 0.0) {
      # positive score with more nonspams than spams? bad.
      if ($rule->{isnice} && $rule->{soratio} < 0.5 && $rule->{score} > 0.5) {
        $badrule = 'nice but +ve score';
      }
 
      if (!$rule->{isnice} && $rule->{soratio} < 0.5 && $rule->{score} > 0.5) {
        if ($opt_f && $rule->{freq_fp} > $rule->{freq_fn}) {
          $badrule = 'fp';
        } else {
          # ignore, the FPs are overridden by other tests so it doesn't
          # affect the overall results.
        }
        # else {
        # ignore, the FPs are overridden by other tests so it doesn't
        # affect the overall results.
        # }
      }
 
      # low number of matches overall
      if ($rule->{spam_percent} < $LOW_MATCHES_PERCENT)
                 { $badrule ||= ''; $badrule .= ', low matches'; }
 
    } elsif ($rule->{score} == 0.0) {
      $badrule = 'score is 0';
    }
  }
 
  if (defined $badrule) {
    $badrule =~ s/^, //;
    $output{$badrule} .= $rule->{name} . " ($badrule)\n";
  }
}


exit;


sub concat_rule_lang {
  my $rule = shift;
  my $lang = shift;

  if (defined $lang && $lang ne '') {
    return "[$lang]_$rule";
  } else {
    return $rule;
  }
}

# note: do not use parse-rules-for-masses here, we need to do linting instead
# of your average parse
sub readrules {
  my @files = <../rules/[0-9]*.cf>;
  my $file;
  %rulesfound = ();
  %langs = ();
  foreach $file (@files) {
    open (IN, "<$file");
    while (<IN>) {
      s/#.*$//g; s/^\s+//; s/\s+$//; next if /^$/;

      # make all the foo-bar stuff foo_bar
      1 while s/^(\S+)-/\1_/g;
      1 while s/^(lang\s+\S+\s+\S+)-/\1_/g;

      my $lang = '';
      if (s/^lang\s+(\S+)\s+//) {
        $lang = $1; $langs{$1} = 1;
      }

      if (/^(header|rawbody|body|full|uri|meta)\s+(\S+)\s+/) {
        $rulesfound{$2} = 1;
        $rulefile{$2} ||= $file;
        $scorefile{$1} = $file;
        $score{$2} ||= 1.0;
        $tflags{$2} ||= '';
        $descfile{$2} ||= $file;       # a rule with no score or desc is OK
	$description{$2}->{$lang} = undef;

        if (/^body\s+\S+\s+eval:/) {
          # ignored
        } elsif (/^body\s+\S+\s+(.*)$/) {
          my $re = $1;

	  # If there's a ( in a rule where it should be (?:, flag it.
	  # but ignore [abc(] ...
          if ($re =~ /[^\\]\([^\?]/ && $re !~ /\[[^\]]*[^\\]\(/) { 
            print "warning: non-(?:...) capture in regexp in $file: $_\n";
          }
          if ($re =~ /\.[\*\+]/) { 
            print "warning: .* in regexp in $file: $_\n";
          }
          if ($re =~ /[^\\]\{(\d*),?(\d*?)\}/) {
            if ($1 > 120 || $2 > 120) {
              print "warning: long .{n} in regexp in $file: $_\n";
            }
          }
        }

      } elsif (/^describe\s+(\S+)\s+(.*?)\s*$/) {
        $rulesfound{$1} = 1;
        $descfile{concat_rule_lang ($1, $lang)} ||= $file;
        $descfile{$1} ||= $file;
	$description{$1}->{$lang} = $2;
      } elsif (/^tflags\s+(\S+)\s+(.+)$/) {
        $rulesfound{$1} = 1;
        $tflags{$1} = $2;
        $tflagsfile{concat_rule_lang ($1, $lang)} = $file;
        $tflagsfile{$1} = $file;
      } elsif (/^score\s+(\S+)\s+(.+)$/) {
        $rulesfound{$1} = 1;
        $scorefile{concat_rule_lang ($1, $lang)} = $file;
        $scorefile{$1} = $file;
        $score{$1} = $2;
      } elsif (/^(clear_report_template|clear_spamtrap_template|report|spamtrap|
                clear_terse_report_template|terse_report|
                required_score|ok_locales|ok_languages|test|lang|
                spamphrase|whitelist_from|require_version|
		clear_unsafe_report_template|unsafe_report|
		(?:bayes_)?auto_learn_threshold_nonspam|(?:bayes_)?auto_learn_threshold_spam|
		(?:bayes_)?auto_learn
                )/x) {
        next;
      } else {
        print "warning: unknown rule in $file: $_\n";
      }
    }
    close IN;
  }
  @langsfound = sort keys %langs;
  @rulesfound = sort keys %rulesfound;
}

sub lintrules {
  my %possible_renames = ();

  foreach my $rule (@rulesfound) {
    my $match = $rule;
    $match =~ s/_\d+[^_]+$//gs;    # trim e.g. "_20K"
    $match =~ s/[^A-Z]+//gs;    # trim numbers etc.

    if (defined ($rulefile{$rule}) && $possible_renames{$match} !~ / \Q$rule\E\b/) {
      $possible_renames{$match} .= " ".$rule;
    }
    $possible_rename_matches{$rule} = $match;
  }

  foreach my $lang ('', @langsfound) {
    foreach my $baserule (@rulesfound) {
      next if ( $baserule =~ /^__/ || $baserule =~ /^T_/ );

      my $rule = concat_rule_lang ($baserule, $lang);
      my $f = $descfile{$rule};
      my $warned = '';

      if (defined $f && !defined ($rulefile{$rule})
                && !defined ($rulefile{$baserule}))
      {
        print "warning: $baserule has description, but no rule: $f\n";
        $warned .= ' lamedesc';
      }

	# Check our convention for rule length
	if ( (($lang ne '' && defined($rulefile{$rule})) || ($lang eq '' && defined ($rulefile{$baserule}))) && length $baserule > 22 ) {
	  print "warning: $baserule has a name longer than 22 chars: $f\n";
	}
 	# Check our convention for rule length
	if ( (($lang ne '' && defined($rulefile{$rule})) || ($lang eq '' && defined ($rulefile{$baserule}))) && defined $description{$baserule}->{$lang} && length $description{$baserule}->{$lang} > 50 ) {
	  print "warning: $baserule has a description longer than 50 chars: $f\n";
	}

      # lang rule trumps normal rule
      $f = $rulefile{$rule} || $rulefile{$baserule};
      # if the rule exists, and the language/rule description doesn't exist ...
      if ( defined $f && !defined $description{$baserule}->{$lang} )
      {
        print "warning: $baserule exists, ",( $lang ne '' ? "lang $lang, " : "" ),"but has no description: $f\n";
        $warned .= ' lamedesc';
      }


      $f = $scorefile{$rule};
      if (defined $f && !defined ($rulefile{$rule})
                && !defined ($rulefile{$baserule}))
      {
        print "warning: $baserule has score, but no rule: $f\n";
        $warned .= ' lamescore';
      }

      my $r = $possible_rename_matches{$rule};
      if ($warned ne '' && defined $r) {
        my @matches = split (' ', $possible_renames{$r});
        if (scalar @matches != 0) {
          my $text = '';

          # now try and figure out "nearby" rules with no description/score
          foreach my $baser (@matches) {
            my $blang;
            if ($descfile{$rule} =~ /text_(\S\S)\./) {
              $blang = $1;
            }
            my $r = concat_rule_lang ($baser, $blang);
            #warn "$r $descfile{$r} $descfile{$baser}";
            next if ($warned =~ /lamedesc/ && (defined $descfile{$r}));
            next if ($warned =~ /lamescore/ && (defined $scorefile{$r}));
            $text .= " $baser";
          }

          if ($text ne '') {
            print "warning: (possible renamed rule? $text)\n";
          }
        }
      }
    }
  }
}





# limitations under the License.
# </@LICENSE>

=head1 NAME
  die <<ENDOFUSAGE;
usage: mass-check [options] target ...
 
  -c=file       set configuration/rules directory
  -p=dir        set user-prefs directory
  -f=file       read list of targets from <file>
  -j=jobs       specify the number of processes to run simultaneously
  --net         turn on network checks!
  --mid         report Message-ID from each message
  --debug       report debugging information
  --progress    show progress updates during check
  --rewrite=OUT save rewritten message to OUT (default is /tmp/out)
  --showdots    print a dot for each scanned message
  --rules=RE    Only test rules matching the given regexp RE
  --restart=N   restart all of the children after processing N messages
  --deencap=RE  Extract SpamAssassin-encapsulated spam mails only if they
                were encapsulated by servers matching the regexp RE
                (default = extract all SpamAssassin-encapsulated mails)
 
  log options
  -o            write all logs to stdout
  --loghits     log the text hit for patterns (useful for debugging)
  --loguris	log the URIs found
  --hamlog=log  use <log> as ham log ('ham.log' is default)
  --spamlog=log use <log> as spam log ('spam.log' is default)
 
  message selection options
  -n            no date sorting or spam/ham interleaving
  --after=N     only test mails received after time_t N (negative values
                are an offset from current time, e.g. -86400 = last day)
                or after date as parsed by Time::ParseDate (e.g. '-6 months')
  --before=N    same as --after, except received times are before time_t N
  --all         don't skip big messages
  --head=N      only check first N ham and N spam (N messages if -n used)
  --tail=N      only check last N ham and N spam (N messages if -n used)
 
  simple target options (implies -o and no ham/spam classification)
  --dir         subsequent targets are directories
  --file        subsequent targets are files in RFC 822 format
  --mbox        subsequent targets are mbox files
  --mbx         subsequent targets are mbx files
 
  Just left over functions we should remove at some point:
  --bayes       report score from Bayesian classifier
 
  non-option arguments are used as target names (mail files and folders),
  the target format is: <class>:<format>:<location>
  <class>       is "spam" or "ham"
  <format>      is "dir", "file", "mbx", or "mbox"
  <location>    is a file or directory name.  globbing of ~ and * is supported

mass-check - Generates SpamAssassin scores and results for large
amounts of mail

=head1 SYNOPSIS

 mass-check [options] class:format:location ...
 mass-check [options] {--dir | --file | --mbox} target ...
 mass-check [options] -f file

  Options:
    -f=file       read list of targets from <file>
    -j=jobs       specify the number of processes to run simultaneously
    --net         turn on network checks!
    --mid         report Message-ID from each message
    --debug       report debugging information
    --progress    show progress updates during check
    --rewrite=OUT save rewritten message to OUT (default is /tmp/out)
    --showdots    print a dot for each scanned message
    --rules=RE    Only test rules matching the given regexp RE
    --restart=N   restart all of the children after processing N messages

    SpamAssassin options
    -c=dir        set configuration/rules directory
    -p=file       set user preferences file (default: none)
    -s=dir        set site rules configuration directory
    -u=dir        set user-state directory
    --dist        assumes the script is being run from the masses/ dir of
                  the unpacked tarball, and makes appropriate guesses for
                  -p and -c
    --deencap=RE  Extract SpamAssassin-encapsulated spam mails only if they
                  were encapsulated by servers matching the regexp RE
                  (default = extract all SpamAssassin-encapsulated mails)

    log options
    -o            write all logs to stdout
    --loghits     log the text hit for patterns (useful for debugging)
    --loguris	  log the URIs found
    --log=file    log to <file> (masses.log is default)

    message selection options
    -n            no date sorting or spam/ham interleaving
    --after=N     only test mails received after time_t N (negative values
                  are an offset from current time, e.g. -86400 = last day)
                  or after date as parsed by Time::ParseDate (e.g. '-6 months')
    --before=N    same as --after, except received times are before time_t N
    --all         don't skip big messages
    --head=N      only check first N ham and N spam (N messages if -n used)
    --tail=N      only check last N ham and N spam (N messages if -n used)

    simple target options (implies -o and no ham/spam classification)
    --dir         subsequent targets are directories
    --file        subsequent targets are files in RFC 822 format
    --mbox        subsequent targets are mbox files
    --mbx         subsequent targets are mbx files

    Just left over functions we should remove at some point:
    --bayes       report score from Bayesian classifier
    --hamlog=log  use <log> as ham log ('ham.log' is default)
    --spamlog=log use <log> as spam log ('spam.log' is default)

=head1 DESCRIPTION

B<mass-check> is designed to assist with rule development and
generation of SpamAssassin scored. It reads in mail from the
location(s) specified on the command line (in the first form above),
given in the form I<class:format:location>, where I<class> is either
"spam" or "ham" (non-spam), I<format> is one of "dir" (Maildirs, MH,
etc), "file", "mbox" (mboxes can be gzipped) or "mbx".

B<mass-check> will analyze each message using SpamAssassin and
generate one-line of output per message, (by default to masses.log) in
the following format:

 {s|h} {s|h} score filename tests-hit

The first field is the message's class as given on the command line
(ham or spam). The second is the message's class as determined by
SpamAssassin. The third is the message's score, as determined by
SpamAssassin. The fourth field contains the message's filename; for
mboxes, this contains the filename and the byte offset from the
beginning of the file separated by a period. The last field contains a
list of all the tests the message hit separated by commas.

If you want to run this on the currently installed version of
SpamAssassin's rules for sitewide use, make sure your user_prefs file
contains no rules.

=head1 BUGS

Please report bugs to http://bugzilla.spamassassin.org/

=head1 SEE ALSO

L<hit-frequencies(1)>, L<logs-to-c(1)>, L<Mail::SpamAssassin::Masses(3)>,
L<perceptron(1)>

=cut

###########################################################################

use vars qw($opt_c $opt_p $opt_f $opt_j $opt_n $opt_o $opt_all
	    $opt_bayes $opt_before $opt_debug $opt_dist $opt_format
	    $opt_hamlog $opt_head $opt_log $opt_loghits $opt_mid
	    $opt_mh $opt_ms $opt_net $opt_nosort $opt_p $opt_progress
	    $opt_s $opt_showdots $opt_spamlog $opt_tail $opt_rules
	    $opt_restart $opt_loguris $opt_after $opt_rewrite $opt_u
	    $opt_deencap);

use FindBin;
use lib "$FindBin::Bin/../lib";
eval "use bytes";
use Mail::SpamAssassin::ArchiveIterator;
use Mail::SpamAssassin;
use Getopt::Long qw(:config bundling auto_help);
use POSIX qw(strftime);
use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; };
use strict; # Why wasn't this on?
use Config;

# default settings

$opt_p = "$FindBin::Bin/spamassassin";
$opt_j = 1;
$opt_net = 0;
$opt_log = "masses.log";
$opt_spamlog = "spam.log";

GetOptions("c=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug",
	   "deencap=s", "dist!", "hamlog=s", "head=i", "log=s",
	   "loghits", "mh", "mid", "ms", "net", "p=s", "progress",
	   "rewrite:s", "s=s", "showdots", "spamlog=s", "tail=i",
	   "rules=s", "restart=i", "u=s", "after=s", "loguris",
	   "dir" => sub { $opt_format = "dir"; },
	   "file" => sub {$opt_format = "file"; },
	   "mbox" => sub { $opt_format = "mbox"; },
	   "mbx" => sub { $opt_format = "mbx"; },
	   '<>' => \&target) or usage();

if ($opt_hamlog || $opt_spamlog) { # Old style logging
  $opt_hamlog ||= "ham.log";
  $opt_spamlog ||= "spam.log";
}

my @targets;

if ($opt_f) {
  open(F, $opt_f) || die $!;
  push(@targets, map { chomp; $_ } <F>);


if (scalar @targets == 0) { usage(); }

# Auto-detect --dist option
if (!defined $opt_dist) {
  if (-f "$FindBin::Bin/../spamassassin.raw") {
    warn "Automatically using --dist. Assuming you are running from the unpacked tarball. Use --no-dist to override.";
    $opt_dist = 1;
  }
}

my $local_rules_dir;
  'debug'              			=> $opt_debug,
  'rules_filename'     			=> $opt_c,
  'userprefs_filename' 			=> "$opt_p/user_prefs",
  'site_rules_filename'			=> "$opt_p/local.cf",
  'userstate_dir'     			=> "$opt_p",
  'save_pattern_hits'  			=> $opt_loghits,
  'dont_copy_prefs'   			=> 1,
  'local_tests_only'   			=> $opt_net ? 0 : 1,
  'only_these_rules'   			=> $opt_rules,
  'ignore_safety_expire_timeout'	=> 1,
  PREFIX				=> '',
  DEF_RULES_DIR        			=> $opt_c,
  LOCAL_RULES_DIR      			=> '',
});

if ($opt_dist) { # Set defaults
  $opt_c ||= "$FindBin::Bin/../rules";
  $opt_p ||= "$FindBin::Bin/mass-check.cf";
  $opt_u ||= "$FindBin::Bin/spamassassin";
  $opt_s ||= "$FindBin::Bin/spamassassin";
  $local_rules_dir = '';
}
else {
  if(!$opt_u) {
    # Assuming this is OK, since mass-check isnt supported on windows, is it?
    # Also, should there be some check to make sure that previous mass-check stuff isn't in there?
    # AFAICT, there isn't otherwise....
    if ( -d "${ENV{HOME}}/.spamassassin" ) {
      $opt_u = "${ENV{HOME}}/.spamassassin/mass-check";
      mkdir $opt_u, 0700 if (! -d $opt_u);
    }
  }

# Leave the rest to SA, we'll get it afterwards

}

my $spamtest = new Mail::SpamAssassin ({
				       'debug'              			=> $opt_debug,
				       'rules_filename'     			=> $opt_c,
				       'userprefs_filename' 			=> $opt_p,
				       'site_rules_filename'			=> $opt_s,
				       'userstate_dir'     			=> $opt_u,
				       'save_pattern_hits'  			=> $opt_loghits,
				       'dont_copy_prefs'   			=> 1,
				       'local_tests_only'   			=> $opt_net ? 0 : 1,
				       'only_these_rules'   			=> $opt_rules,
				       'ignore_safety_expire_timeout'	=> 1,
				       DEF_RULES_DIR        			=> $opt_c,
				       LOCAL_RULES_DIR      			=> $local_rules_dir,
				      });

$spamtest->compile_now(1);
if ($opt_dist) {
  $spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf");
}

my $who   = `id -un 2>/dev/null`;   chomp $who;
my $where = `uname -n 2>/dev/null`; chomp $where;
my $when  = `date -u`;              chomp $when;
my $revision;

if ($opt_dist) {
  my $rev = "unknown";
  if (open(TESTING, "$opt_c/70_testing.cf")) {
    chomp($rev = <TESTING>);
    $rev =~ s/.*\$Rev:\s*(\S+).*/$1/;
    close(TESTING);
  }
  $revision = "SVN revision: $rev";
}
else {
  $revision = "Local";
}

my $log_header = "# mass-check results from $who\@$where, on $when\n" .
		 "# M:SA version ".$spamtest->Version()."\n" .
		 "# $revision\n" .
		 "# Perl version: $] on $Config{archname}\n";

if (!$opt_dist) {
  my @paths = ( $spamtest->{rules_filename}, $spamtest->{site_rules_filename}, $spamtest->{userprefs_filename} );
  $log_header .= "# Using configuration:\n";
  foreach my $file (@paths) {
    $log_header .=  "# $file\n";
  }
}

my $host = $ENV{'HOSTNAME'} || $ENV{'HOST'} || `hostname` || 'localhost';
chomp $host;


    autoflush STDOUT 1;
    print STDOUT $log_header;
  }
  elsif ($opt_hamlog || $opt_spamlog) {
    open(HAM, "> $opt_hamlog");
    open(SPAM, "> $opt_spamlog");
    autoflush HAM 1;

    print HAM $log_header;
    print SPAM $log_header;
  }
  else {
    open(OUT, "> $opt_log");
    autoflush OUT 1;
    print OUT $log_header;
  }
  $init_results = 1;
}


  # don't open results files until we get here to avoid overwriting files
  &init_results if !$init_results;

  if ($opt_o) {
    print STDOUT $result;
    $spam_count++;
  }
  elsif ($opt_spamlog || $opt_hamlog) {
    if ($class eq "s") {
      print SPAM $result;
    } else {
      print HAM $result;
    }
  }
  else {
    print OUT $result;
  }

  $total_count++;
#warn ">> result: $total_count $class $time\n";

  if ($opt_progress) {
    if ($class eq "s") {
      $spam_count++;
    }
    else {
      $ham_count++;
    }
    progress($time);
  }
}

sub wanted {
  my ($class, $id, $time, $dataref) = @_;
  my $out;

  my $ma = $spamtest->parse($dataref, 1);

    push(@extra, "mid=$mid");
  }

  my $result;
  my $score;
  my $tests;
  my $extra;

  if ($opt_loguris) {
    $result = '.';
    $score = 0;
    $tests = join(" ", sort @uris);
    $extra = '';
  } else {
    if ($status->is_spam()) {
      $result = "s";
    } else {
      $result = "h";
    }
    $score = $status->get_score();
    $tests = join(",", sort(grep(length,$status->get_names_of_tests_hit(),$status->get_names_of_subtests_hit())));
    $extra = join(",", @extra);


  $id =~ s/\s/_/g;

  $out .= sprintf("%s %s %05.2f %s %s %s\n", $class, $result, $score, $id, $tests, $extra);

  if ($tests =~ /MICROSOFT_EXECUTABLE|MIME_SUSPECT_NAME/) {
    $out .= logkilled($ma, $id, "possible virus");

Lines 10-16 Link Here

(-)mk-baseline-results (-2 / +2 lines)
10	echo "Classification success on test corpora, at default threshold:"	10	echo "Classification success on test corpora, at default threshold:"
11	echo	11	echo
12		12
13	./logs-to-c --spam=spam-validate.log --nonspam=nonspam-validate.log --threshold 5 --count --scoreset=$SCORESET \| sed -e 's/^Reading.*//' -e '/^$/d'	13	./fp-fn-statistics --logfile=masses-validate.log --threshold 5 --scoreset=$SCORESET \| sed -e 's/^Reading.*//' -e '/^$/d'
14		14
15	echo	15	echo
16	echo "Results on test corpora at various alternative thresholds:"	16	echo "Results on test corpora at various alternative thresholds:"
Lines 18-24 Link Here
18		18
19	# list a wide range of thresholds, so that we can make graphs later ;)	19	# list a wide range of thresholds, so that we can make graphs later ;)
20	for thresh in -4 -3 -2 -1 0 1 2 3 4 4.5 5.5 6 6.5 7 8 9 10 12 15 17 20 ; do	20	for thresh in -4 -3 -2 -1 0 1 2 3 4 4.5 5.5 6 6.5 7 8 9 10 12 15 17 20 ; do
21	./logs-to-c --spam=spam-validate.log --nonspam=nonspam-validate.log --threshold $thresh --count --scoreset=$SCORESET \| sed -e 's/^Reading.*//' -e '/^$/d'	21	./logs-to-c --logfile=masses-validate.log --threshold $thresh --scoreset=$SCORESET \| sed -e 's/^Reading.*//' -e '/^$/d'
22	echo	22	echo
23	done	23	done
24		24




  This script is used to perform "mass checks" of a set of mailboxes, Cyrus
  folders, and/or MH mail spools.  It generates summary lines like this:

  s s 07.22 /home/jm/Mail/Sapm/1382 SUBJ_ALL_CAPS,SUPERLONG_LINE,SUBJ_FULL_OF_8BITS

  or for mailboxes,

  h h 01.32 /path/to/mbox:<5.1.0.14.2.20011004073932.05f4fd28@localhost> TRACKER_ID,BALANCE_FOR_LONG

  listing the path to the message or its message ID, its score, and the tests
  that triggered on that mail.

  get good hits with few false positives, etc., and re-score the tests to
  optimise the ratio.

  If given the --dist option, this script relies on the spamassassin
  distribution directory living in "..". If this script is not in the
  distribution directory, it will generate logs based on the site-wide
  rules, as well as personal rules.


logs-to-c :

  Takes the "masses.log" file and converts them into C source files
  and simplified data files for use by the C score optimization
  algorithm.  (Called by "make" when you build the perceptron, so
  generally you won't need to run it yourself.)


hit-frequencies :

  Analyses the log files and computes how often each test hits, overall,
  for spam mails and for non-spam.


mk-baseline-results :

  Compute results for the baseline scores (read from ../rules/*).  If you

  It will output statistics on the current ruleset to ../rules/STATISTICS.txt,
  suitable for a release build of SpamAssassin.


perceptron.c :

  Perceptron learner by Henry Stern.  See "README.perceptron" for details.




#!/usr/bin/perl -w
#
# <@LICENSE>
# Copyright 2004 Apache Software Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>
 
 
use FindBin;
use lib "$FindBin::Bin/../lib";
use Mail::SpamAssassin::Masses;
use Getopt::Long;
use strict;
use warnings;

use vars qw{$opt_c $opt_l $opt_s $opt_h $opt_t $opt_lambda};

GetOptions("c|cffile=s@" => \$opt_c, "l|logfile=s" => \$opt_l,
"s|scoreset=i" => \$opt_s, "h|help" => \$opt_h, "t|threshold" =>
\$opt_t, "lambda" => \$opt_lambda);


# Need to add rule for scores file 
sub usage {
  die "logs-to-c [-c rules dir] [-l log file] [-s SS] [-t]

    -c,--cffile   p   use p as rules directory
    -s,--scoreset SS  use scoreset SS
    -l,--log      log read logs from log instead of from masses.log
    -t,--threshold n  use a threshold of n (default: 5)
    --lambda=n        use a lambda value of n (default: 5)
";
}


usage() if $opt_h;

$opt_c ||= "$FindBin::Bin/../rules";
$opt_t = (defined($opt_t) ? $opt_t : 5);
$opt_s ||= 0; #|
$opt_l ||= "masses.log";
$opt_lambda ||= 5;

print "$opt_c used for config";

my $nybias = 10;


my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
                                               scoreset => $opt_s, # ,,
                                               logfile => $opt_l});

$masses->readlogs();

my $logs = $masses->get_logs();

my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore) = (0,0,0,0,0,0,0,0);

my $num_spam = $masses->get_num_spam();
my $num_ham = $masses->get_num_ham();
my $num_logs = $num_spam + $num_ham;

my $count = 0;

my $score;

foreach my $log (@$logs) {

  $score = 0;
  foreach my $test (@{$log->{tests_hit}}) {

    next if ($test->{issubrule});
    next if (!$test->{score});

    $score += $test->{score};

  }

  if ($score >= $opt_t) {
    if ($log->{isspam}) {
      $ga_yy++;
      $yyscore += $score;
    }
    else {
      $ga_ny++;
      $nyscore += $score;
    }
  } else {
    if ($log->{isspam}) {
      $ga_yn++;
      $ynscore += $score;
    }
    else {
      $ga_nn++;
      $nnscore += $score;
    }
  }
}

$nybias = $nybias * ($num_spam / $num_ham);

my $fprate = ($ga_ny / $num_logs) * 100.0;
my $fnrate = ($ga_yn / $num_logs) * 100.0;

printf ("\n# SUMMARY for threshold %3.1f:\n", $opt_t);
printf "# Correctly non-spam: %6d  %4.2f%%  (%4.2f%% of non-spam corpus)\n", $ga_nn,
  ($ga_nn /  $num_logs) * 100.0, ($ga_nn /  $num_ham) * 100.0;
printf "# Correctly spam:     %6d  %4.2f%%  (%4.2f%% of spam corpus)\n" , $ga_yy,
  ($ga_yy /  $num_logs) * 100.0, ($ga_yy /  $num_spam) * 100.0;
printf "# False positives:    %6d  %4.2f%%  (%4.2f%% of nonspam, %6.0f weighted)\n", $ga_ny,
  $fprate, ($ga_ny /  $num_ham) * 100.0, $nyscore*$nybias;
printf "# False negatives:    %6d  %4.2f%%  (%4.2f%% of spam, %6.0f weighted)\n", $ga_yn,
  $fnrate, ($ga_yn /  $num_spam) * 100.0, $ynscore;

# convert to the TCR metrics used in the published lit
my $nspamspam = $ga_yy;
my $nspamlegit = $ga_yn;
my $nlegitspam = $ga_ny;
my $nlegitlegit = $ga_yn;
my $nlegit = $num_ham;
my $nspam = $num_spam;

my $werr = ($opt_lambda * $nlegitspam + $nspamlegit)
  / ($opt_lambda * $nlegit + $nspam);

my $werr_base = $nspam
  / ($opt_lambda * $nlegit + $nspam);

$werr ||= 0.000001;     # avoid / by 0
my $tcr = $werr_base / $werr;

my $sr = ($nspamspam / $nspam) * 100.0;
my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;
printf "# TCR: %3.6f  SpamRecall: %3.3f%%  SpamPrec: %3.3f%%  FP: %3.2f%%  FN: %3.2f%%\n", $tcr, $sr, $sp, $fprate, $fnrate;


Lines 68-74 Link Here

(-)extract-message-from-mbox (-1 / +1 lines)
68		68
69	sub masscheck {	69	sub masscheck {
70	while (<STDIN>) {	70	while (<STDIN>) {
71	my $mail = (split(/\s+/, $_))[2];	71	my $mail = (split(/\s+/, $_))[3];
72	$mail =~ tr/_/ /;	72	$mail =~ tr/_/ /;
73	if ($mail =~ /^(.*)\.(\d+)$/) {	73	if ($mail =~ /^(.*)\.(\d+)$/) {
74	extract($1, $2);	74	extract($1, $2);




# limitations under the License.
# </@LICENSE>

=head1 NAME
use vars qw($opt_cffile $opt_count $opt_lambda $opt_threshold
		$opt_spam $opt_nonspam);

logs-to-c - Convert a mass-check log into perceptron format
my $argcffile = $opt_cffile;

=head1 SYNOPSIS
if ($opt_count) { $justcount = 1; }

logs-to-c [options]
if (defined $opt_threshold) { $threshold = $opt_threshold; }

 Options:
    -c,--cffile=path	  Use path as the rules directory
    -s,--scoreset=n	  Use scoreset n
    -l,--logfile=file	  Read in file instead of masses.log
    -o,--outputdir        Put output in the specified dir (default tmp/)

=head1 DESCRIPTION

B<logs-to-c> will read the mass-check log F<masses.log> or as
specified by the B<--logfile> option, and convert it into the format
needed by the perceptron. This is a format that is simple for the
perceptron to parse, but is not very readable to humans.
# infrequently-read folder".

By default, output will be put in the directory ./tmp/ unless another
directory is specified by the B<--outputdir> option. (Note: at the
current time, this must be /tmp/ in order for the perceptron to
compile properly.)

=head1 BUGS
my %tests_hit = ();
my %mutable_tests = ();

Please report bugs to http://bugzilla.spamassassin.org/

=head1 SEE ALSO

L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>
my ($num_tests, $num_spam, $num_nonspam);
my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore);

=cut
read_ranges();

use FindBin;
use lib "$FindBin::Bin/../lib";
use Mail::SpamAssassin::Masses;
use Getopt::Long qw(:config bundling auto_help);
use strict;
use warnings;

use vars qw{$opt_c $opt_l $opt_s $opt_o};

GetOptions("c|cffile=s@" => \$opt_c,
	   "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
	   "l|logfile=s" => \$opt_l,
	   "o|output=s" => \$opt_o);


$opt_o ||= "./tmp/";
if (!-d $opt_o) {
  mkdir $opt_o, 0777 or die "Can't mkdir $opt_o";
}
exit 0;

if (!$opt_c || !scalar(@$opt_c)) {
    # Try to read this in from the log, if possible
    open IN, $opt_l or die "Can't open $opt_l: $!";
    my $files = 0; # are we in the files section?
    while(<IN>) {
	if (!$files) {
	    if (/^\# SVN revision:/) {
		$opt_c = [ "$FindBin::Bin/../rules" ];
		last;
	    } elsif (/^\# Using configuration:$/) {
		$files = 1;
	    }
	} elsif (/^\#\s+(.*)\s*$/) {
	    push (@$opt_c, $1);
	} else {
	    # All done!
	    last;
	}
    }

    foreach my $file (@$opt_c) {
	die "Can't read $file" unless -r $file;
    }
}

# ignore rules that are subrules -- we don't generate scores for them...
    $ga_yy = $ga_ny = $ga_yn = $ga_nn = 0;
    $yyscore = $ynscore = $nyscore = $nnscore = 0.0;
  }

# Note: this will cause a difference over the old logs-to-c since rank
# is dependent on the frequencies of all rules, not just non-subrules

my $greprules = sub { return 0 if $_[1]->{issubrule}; return 1; };
      next if /^\#/;
      next if /^$/;
      if($_ !~ /^.\s+([-\d]+)\s+\S+\s*/) { warn "bad line: $_"; next; }
      my $hits = $1;
#my $foo = $_;
      $_ = $'; s/(?:bayes|time)=\S+//; s/,,+/,/g; s/^\s+//; s/\s+$//;

$opt_s ||= 0; # |
      my @tests = ();
      foreach my $tst (split (/,/, $_)) {
	next if ($tst eq '');
	if (!defined $scores{$tst}) {
          #warn "unknown test in $file, ignored: $tst\n";
	  next;
	}

my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
					       scoreset => $opt_s, # ,,
					       logfile => $opt_l,
                                               greprules => $greprules });

$masses->readlogs();
$masses->do_score_ranges();
        } else {
          push (@tests, $tst);
        }
      }

my $rules = $masses->get_rules_array();
my $logs = $masses->get_logs();
      }

my @index_to_rule;
my $num_spam = $masses->get_num_spam();
my $num_ham = $masses->get_num_ham();
          if ($score >= $threshold) {
            $ga_yy++; $yyscore += $score;
          } else {
            $ga_yn++; $ynscore += $score;
          }
        } else {
          $is_spam{$count} = 1;
        }
      } else {
	$num_nonspam++;
        if ($justcount) {
          if ($score >= $threshold) {
#print "$score -- $foo";
            $ga_ny++; $nyscore += $score;
          } else {
            $ga_nn++; $nnscore += $score;
          }
        } else {
          $is_spam{$count} = 0;
        }
      }
      $count++;
    }
    close IN;
  }
  $num_tests = $count;
}

# This is misleading -- num_tests is really num_msgs
my $num_tests = $num_spam + $num_ham;

sub readscores {
  if (!defined $argcffile) { $argcffile = "../rules"; }
  print "Reading scores from \"$argcffile\"...\n";
  system ("./parse-rules-for-masses -d \"$argcffile\" -s $opt_scoreset") and die;
  require "./tmp/rules.pl";
  %allrules = %rules;           # ensure it stays global
}

# Write logs and scores as C code
writescores_c();
writetests_c();


sub writescores_c {

  my $size = 0;
  my $mutable = 0;
  my $output = '';
  my $count = 0;
  my $score = 0;

  foreach my $rule (sort {($b->{ismutable} <=> $a->{ismutable}) ||
			  ($a->{name} cmp $b->{name}) } @$rules) {

    $score = $rule->{score};
			  ($mutable_tests{$b} <=> $mutable_tests{$a}) ||
			   ($a cmp $b)} (keys %scores);
  my $max_hits_per_msg = 0;
  for ($file = 0; $file < $num_tests; $file++) {
    my(@hits) =
     grep {(! $ignored_rule{$_}) && $mutable_tests{$_}} (@{$tests_hit{$file}});
    if ((scalar(@hits)+1) > $max_hits_per_msg) {
      $max_hits_per_msg = scalar(@hits)+1;
    }
  }

    # ignored rules (i.e. no scores)
    next unless $score;
    $rule_to_index{$name} = $i;

    # also ignore rules with score range 0
    next if (!$rule->{range_lo} && !$rule->{range_hi});

    # Set an index
    $rule->{index} = $count;
    $index_to_rule[$count] = $rule; # add the reference to the array

    if ($rule->{ismutable}) {
      $mutable++;
      if ($score > $rule->{range_hi}) {
	$score = $rule->{range_hi} - 0.001;
      } elsif ($score < $rule->{range_lo}) {
	$score = $rule->{range_lo} + 0.001;
      }
      #$range_lo{$name} ||= 0.1;
      #$range_hi{$name} ||= 1.5;
    }
    # These should all be set properly if not mutable
    # score = range_lo = range_hi
    else {
      warn "hi != lo for " . $rule->{name} . "!" if $rule->{range_lo} != $rule->{range_hi};
      $score = $rule->{range_hi} = $rule->{range_lo};
    }

    $output .= "." . $count . "\n" .
         "n" . $rule->{name} . "\n" .
	 "b" . $score . "\n" .
	 "m" . $rule->{ismutable} . "\n" .
	 "l" . $rule->{range_lo} . "\n" .
	 "h" . $rule->{range_hi} . "\n";

    $count++;

  }

  # Output this

  open (DAT, ">$opt_o/scores.data");
  print DAT "N$count\n", "M$mutable\n"; # informational
  print DAT $output;
  close DAT;

  open (OUT, ">$opt_o/scores.h");
  print OUT <<EOF;
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
 
int num_scores = $count;
int num_mutable = $mutable;
unsigned char is_mutable[$count];
double range_lo[$count];
double range_hi[$count];
double bestscores[$count];
char *score_names[$count];
double tmp_scores[$count][2];
unsigned char ny_hit[$mutable];
unsigned char yn_hit[$mutable];
 
double lookup[$mutable];
 
/* readscores() is defined in tests.h */
EOF

";
  close OUT;

  writetests_c($max_hits_per_msg); # make sure $rule_to_index is around
}


sub writetests_c {
  my $max_hits_per_msg = $_[0];

  my $max_hits_per_msg = 0;
  my @goodtests;
  my %uniq_logs;
  my $uniq_key;

  my $i = 0;

  # This will "compress" the logs so that one log entry can have a
  # "count" of n indicating it reprents n similar messages
    my $uniq_key = $is_spam{$file} . " ";

  foreach my $log (@$logs) {
     grep {length($_) && (! $ignored_rule{$_}) &&
	    (defined($rule_to_index{$_}))} (@{ $tests_hit{$file} });

    (@goodtests) = grep {exists($_->{index})} (@{$log->{tests_hit}});
    @goodtests = sort {$a <=> $b} map {$_->{index}} @goodtests;

    if($max_hits_per_msg < scalar(@goodtests)) {
      $max_hits_per_msg = scalar(@goodtests);
    }

    $uniq_key = $log->{isspam} ? "s" : "";
    $uniq_key .= join(" ", @goodtests);


    # The %count_keys hash's entries will be the log info for each unique log
    # $log->{count} is increased to indicate similar logs

    if (exists($uniq_logs{$uniq_key})) {
      $uniq_logs{$uniq_key}->{count}++;
    } else {
      $uniq_logs{$uniq_key} = $log;
      $uniq_logs{$uniq_key}->{count} = 1;
      $uniq_files{$file} = scalar(keys(%count_keys)) - 1;
    }

  }

  my $num_nondup = scalar(keys %uniq_logs);

  open TOP, ">$opt_o/tests.h";
  print TOP <<EOF;
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

int num_tests = $num_tests;
int num_nondup = $num_nondup;
int num_spam = $num_spam;
int num_nonspam = $num_ham;
int max_hits_per_msg = $max_hits_per_msg;
unsigned char num_tests_hit[$num_nondup];
unsigned char is_spam[$num_nondup];

double scores[$num_nondup];
double tmp_total[$num_nondup];
int tests_count[$num_nondup];
EOF


  print TOP join('', <DATA>);
  print TOP $_;
  close TOP;

  open (DAT, ">tmp/tests.data");

  open (DAT, ">$opt_o/tests.data");
    print DAT ".".$uniq_files{$file}."\n";

  my $out;
  my $base_score;
  my $num_tests_hit;

  $i = 0;
  foreach my $log (values %uniq_logs) {
    $out = '';
    $base_score = $num_tests_hit = 0;

    print DAT "." . $i . "\n";

    $out .= "s" . ( ($log->{isspam})? 1 : 0 ) . "\n";

    foreach my $test (@{$log->{tests_hit}}) {
      if (!$test->{score}) {
	# Don't really know why this happens, but the old logs-to-c
	#did it too

	warn "ignored rule " . $test->{name} . " got a hit!";
	next;
      }

      if (!$test->{range_lo} && !$test->{range_hi}) {
	# We ignored this rule
	next;
      }

      # debugging...
      if (!defined $test->{index}) {
	warn "test with no index";

      if ($num_tests_hit >= $max_hits_per_msg) {
	die "Need to increase \$max_hits_per_msg";
      }
      } else {
	$base_score += $scores{$test};
      }
    }

      if ($test->{ismutable}) {
	$num_tests_hit++;
	$out .= "t".$test->{index}."\n";

	if ($num_tests_hit >= $max_hits_per_msg) {
	  die "\$max_hits_per_msg not big enough!";
	}
}

      }
      else {
	$base_score += $test->{score};
      }

  # read ranges, and mutableness, from ranges.data.
  open (IN, "<tmp/ranges.data")
  	or die "need to run score-ranges-from-freqs first!";

  my $count = 0;
  while (<IN>) {
    /^(\S+) (\S+) (\d+) (\S+)$/ or next;
    my $t = $4;
    $range_lo{$t} = $1+0;
    $range_hi{$t} = $2+0;
    my $mut = $3+0;

    if ($allrules{$t}->{issubrule}) {
      $ignored_rule{$t} = 1;
      $mutable_tests{$t} = 0;
      next;
    }
    if (($range_lo{$t} == $range_hi{$t}) && (! $range_lo{$t})) {
      #warn "ignored rule: score and range == 0: $t\n";
      $ignored_rule{$t} = 1;
      $mutable_tests{$t} = 0;
      next;
    }

    $out .= "b" . $base_score . "\n"; # score to add for non-mutable tests
    $out .= "c" . $log->{count} . "\n"; # number of identical logs
    $count++;

    print DAT "n" . $num_tests_hit . "\n" . $out;
      $mutable_tests{$t} = 0;
    } elsif ($range_lo{$t} == $range_hi{$t}) {
      $mutable_tests{$t} = 0;
    } elsif ($allrules{$t}->{tflags} =~ m/\buserconf\b/i) {
      $mutable_tests{$t} = 0;
    } else {
      $mutable_tests{$t} = 1;
    }
    unless ($mutable_tests{$t} || $scores{$t}) {
      $ignored_rule{$t} = 1;
    }
  }
  close IN;

    $i++;
  foreach my $t (sort keys %allrules) {
    next if (exists($range_lo{$t}));
    if ($allrules{$t}->{issubrule}) {
      $ignored_rule{$t} = 1;
      $mutable_tests{$t} = 0;
      next;
    }
    $ignored_rule{$t} = 0;
    unless (exists($mutable_tests{$t}) &&
	    ($allrules{$t}->{tflags} !~ m/\buserconf\b/i)) {
      $mutable_tests{$t} = 0;
    }
    unless ($mutable_tests{$t} || $scores{$t}) {
      $ignored_rule{$t} = 1;
    }
    $index_to_rule[$count] = $t;
    $count++;
  }
  foreach my $t (keys %range_lo) {
    next if ($ignored_rule{$t});
    if ($mutable_tests{$t}) {
      if (($scores{$t} == 1) && ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
	$scores{$t} = -1;
      } elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/) &&
	       ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
	$scores{$t} = -0.01;
      }
      if ($scores{$t} >= $range_hi{$t}) {
	$scores{$t} = $range_hi{$t} - 0.001;
      } elsif ($scores{$t} <= $range_lo{$t}) {
	$scores{$t} = $range_lo{$t} + 0.001;
      }
    } else {
      if ($allrules{$t}->{tflags} =~ m/\buserconf\b/i) {
	next;
      } elsif ($range_lo{$t} == $range_hi{$t}) {
	$scores{$t} = $range_lo{$t};
	next;
      }
      if (($scores{$t} == 1) && ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
	$scores{$t} = -1;
      } elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/) &&
	       ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
	$scores{$t} = -0.01;
      }
      if ($scores{$t} > $range_hi{$t}) {
	$scores{$t} = $range_hi{$t};
      } elsif ($scores{$t} < $range_lo{$t}) {
	$scores{$t} = $range_lo{$t};
      }
    }
  }
}

  close DAT;
   my $fprate = ($ga_ny / $num_tests) * 100.0;
   my $fnrate = ($ga_yn / $num_tests) * 100.0;

   printf ("\n# SUMMARY for threshold %3.1f:\n", $threshold);
   printf "# Correctly non-spam: %6d  %4.2f%%  (%4.2f%% of non-spam corpus)\n", $ga_nn,
       ($ga_nn /  $num_tests) * 100.0, ($ga_nn /  $num_nonspam) * 100.0;
   printf "# Correctly spam:     %6d  %4.2f%%  (%4.2f%% of spam corpus)\n" , $ga_yy,
       ($ga_yy /  $num_tests) * 100.0, ($ga_yy /  $num_spam) * 100.0;
   printf "# False positives:    %6d  %4.2f%%  (%4.2f%% of nonspam, %6.0f weighted)\n", $ga_ny,
       $fprate, ($ga_ny /  $num_nonspam) * 100.0, $nyscore*$nybias;
   printf "# False negatives:    %6d  %4.2f%%  (%4.2f%% of spam, %6.0f weighted)\n", $ga_yn,
       $fnrate, ($ga_yn /  $num_spam) * 100.0, $ynscore;

  # convert to the TCR metrics used in the published lit
  my $nspamspam = $ga_yy;
  my $nspamlegit = $ga_yn;
  my $nlegitspam = $ga_ny;
  my $nlegitlegit = $ga_yn;
  my $nlegit = $num_nonspam;
  my $nspam = $num_spam;

  my $werr = ($lambda * $nlegitspam + $nspamlegit)
                  / ($lambda * $nlegit + $nspam);

  my $werr_base = $nspam
                  / ($lambda * $nlegit + $nspam);

  $werr ||= 0.000001;     # avoid / by 0
  my $tcr = $werr_base / $werr;

  my $sr = ($nspamspam / $nspam) * 100.0;
  my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;
  printf "# TCR: %3.6f  SpamRecall: %3.3f%%  SpamPrec: %3.3f%%  FP: %3.2f%%  FN: %3.2f%%\n", $tcr, $sr, $sp, $fprate, $fnrate;
}

__DATA__

__DATA__
void loadtests (void) {
  FILE *fin = fopen ("tmp/tests.data", "r");
  char buf[256];


  printf ("Read scores for %d tests.\n", num_scores);
}





#!/usr/bin/perl -w
#
# <@LICENSE>
# Copyright 2004 Apache Software Foundation
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

# (rough) graphic demo of this algorithm:
# 0.0  = -limit [......] 0 ........ limit
# 0.25 = -limit ..[..... 0 .]...... limit
# 0.5  = -limit ....[... 0 ...].... limit
# 0.75 = -limit ......[. 0 .....].. limit
# 1.0  = -limit ........ 0 [......] limit
my $sliding_window_limits = 4.8; # limits = [-$range, +$range]
my $sliding_window_size =   5.5; # scores have this range within limits

# 0.0  = -limit [......] 0 ........ limit
# 0.25 = -limit ....[... 0 ]....... limit
# 0.5  = -limit ......[. 0 .]...... limit (note: tighter)
# 0.75 = -limit .......[ 0 ...].... limit
# 1.0  = -limit ........ 0 [......] limit
my $shrinking_window_lower_base =   0.00; 
my $shrinking_window_lower_range =  1.00; # *ratio, added to above
my $shrinking_window_size_base =    1.00;
my $shrinking_window_size_range =   1.00; # *ratio, added to above

my $use_sliding_window = 0;

my $argcffile = shift @ARGV;
my $scoreset = shift @ARGV;
$scoreset = 0 if ( !defined $scoreset );

if (defined ($argcffile) && $argcffile eq '-test') {
  # use this to debug the ranking -> score-range mapping:
  for $rat (0.0, 0.25, 0.5, 0.75, 1.0) {
    my ($lo, $hi); if ($use_sliding_window) {
      ($lo, $hi) = sliding_window_ratio_to_range($rat);
    } else {
      ($lo, $hi) = shrinking_window_ratio_to_range($rat);
    }
    warn "test: $rat => [ $lo $hi ]\n";
  } exit;
}

my %freq_spam = ();
my %freq_nonspam = ();

my $num_spam;
my $num_nonspam;
my $num_total;

my %mutable_tests = ();
my %ranking = ();
my %soratio = ();
my %is_nice = ();

if (!defined $argcffile) { $argcffile = "../rules"; }
system ("./parse-rules-for-masses -d \"$argcffile\" -s $scoreset") and die;
if (-e "tmp/rules.pl") {
  # Note, the spaces need to stay in front of the require to work around a RPM 4.1 problem
  require "./tmp/rules.pl";
}
else {
  die "parse-rules-for-masses had no error but no tmp/rules.pl!?!";
}

while (<>) {
  /^\s*([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+\S+\s+(.+)\s*$/ or next;

  my $overall = $1+0;
  my $spam = $2+0;
  my $nonspam = $3+0;
  my $soratio = $4+0;
  my $ranking = $5+0;
  my $test = $6;

  if ($test eq '(all messages)') {
    $num_spam = $spam;
    $num_nonspam = $nonspam;
    $num_total = $spam+$nonspam;
    next;
  }
  next if ($test eq '(all messages as %)');

  if (!defined ($rules{$test})) {
    warn "rule $test no longer exists; ignoring\n";
    next;
  }

  $freq{$test} = $overall;
  $freq_spam{$test} = $spam;
  $freq_nonspam{$test} = $nonspam;

  my $tflags = $rules{$test}->{tflags}; $tflags ||= '';
  if ($tflags =~ /\buserconf\b/ ||
      ( ($scoreset % 2) == 0 && $tflags =~ /\bnet\b/ )) {
    $mutable_tests{$test} = 0;
  } else {
    $mutable_tests{$test} = 1;
  }
  if ($tflags =~ m/\bnice\b/i) {
    $is_nice{$test} = 1;
  } else {
    $is_nice{$test} = 0;
  }

  if ($overall < 0.01) {        # less than 0.01% of messages were hit
    $mutable_tests{$test} = 0;
    $soratio{$test} = 0.5;
    $ranking{$test} = 0.0;
    $rules{$test}->{score} = 0; # tvd - disable these rules automagically

  } else {
    $soratio{$test} = $soratio;
    $ranking{$test} = $ranking;
  }
}

if ( ! mkdir "tmp", 0755 ) {
  warn "Couldn't create tmp directory!: $!\n";
}

open (OUT, ">tmp/ranges.data");
foreach my $test (sort { $ranking{$b} <=> $ranking{$a} } keys %freq) {
  if (!defined ($rules{$test})) {
    warn "no rule $test";
    print OUT ("0 0 0 $test\n");
    next;
  }

  my $overall = $freq{$test};
  my $spam = $freq_spam{$test};
  my $nonspam = $freq_nonspam{$test};
  my $soratio = $soratio{$test};
  my $ranking = $ranking{$test};
  my $mutable = $mutable_tests{$test};

  if (!$mutable || $rules{$test}->{score} == 0) { # didn't look for score 0 - tvd
    printf OUT ("%3.3f %3.3f 0 $test\n",
                         $rules{$test}->{score},
                         $rules{$test}->{score});
    next;
  }

  # 0.0 = best nice, 1.0 = best nonnice
  if ($is_nice{$test}) {
    $ranking = .5 - ($ranking / 2);
  } else {
    $ranking = .5 + ($ranking / 2);
  }

  my ($lo, $hi);
  if ($use_sliding_window) {
    ($lo, $hi) = sliding_window_ratio_to_range($ranking);
  } else {
    ($lo, $hi) = shrinking_window_ratio_to_range($ranking);
  }

  # tvd
  my $tflags = $rules{$test}->{tflags}; $tflags ||= '';
  if ( $is_nice{$test} && ( $ranking < .5 ) ) { # proper nice rule
    if ( $tflags =~ /\blearn\b/ ) { # learn rules should get a higher score # -5.4
      $lo *=1.8;
    }
    elsif ($soratio <= 0.05 && $nonspam > 0.5) { # let good rules be larger if they want to, -4.5
      $lo *= 1.5;
    }

    $hi =	($soratio == 0) ? $lo :
    		($soratio <= 0.005 ) ? $lo/1.1 :
    		($soratio <= 0.010 && $nonspam > 0.2) ? $lo/2.0 :
		($soratio <= 0.025 && $nonspam > 1.5) ? $lo/10.0 :
		0;

    if ( $soratio >= 0.35 ) { # auto-disable bad rules
      ($lo,$hi) = (0,0);
    }
  }
  elsif ( !$is_nice{$test} && ( $ranking >= .5 ) ) { # proper spam rule
    if ( $tflags =~ /\blearn\b/ ) { # learn rules should get a higher score
      $hi *=1.8;
    }
    elsif ( $soratio >= 0.99 && $spam > 1.0 ) {
      $hi *= 1.5; # let good rules be larger if they want to
    }

    $lo =	($soratio == 1) ? $hi:
    		($soratio >= 0.995 ) ? $hi/4.0 :
    		($soratio >= 0.990 && $spam > 1.0) ? $hi/8.0 :
		($soratio >= 0.900 && $spam > 10.0) ? $hi/24.0 :
		0;

    if ( $soratio <= 0.65 ) { # auto-disable bad rules
      ($lo,$hi) = (0,0);
    }
  }
  else { # rule that has bad nice setting
    ($lo,$hi) = (0,0);
  }
  $mutable = 0 if ( $hi == $lo );

  printf OUT ("%3.1f %3.1f $mutable $test\n", $lo, $hi);
}
close OUT;
exit;

sub sliding_window_ratio_to_range {
  my $ratio = shift;
  my $lo = -$sliding_window_limits + ($sliding_window_size * $ratio);
  my $hi = +$sliding_window_limits - ($sliding_window_size * (1-$ratio));
  if ($lo > $hi) { # ???
    ($lo,$hi) = ($hi,$lo);
  }
  ($lo, $hi);
}

sub shrinking_window_ratio_to_range {
  my $ratio = shift;
  my $is_nice = 0;
  my $adjusted = ($ratio -.5) * 2;      # adj [0,1] to [-1,1]
  if ($adjusted < 0) { $is_nice = 1; $adjusted = -$adjusted; }

#$adjusted /= 1.5 if ( $ratio < 0.95 && $ratio > 0.15 ); # tvd

  my $lower = $shrinking_window_lower_base 
                        + ($shrinking_window_lower_range * $adjusted);
  my $range = $shrinking_window_size_base 
                        + ($shrinking_window_size_range * $adjusted);
  my $lo = $lower;
  my $hi = $lower + $range;
  if ($is_nice) {
    my $tmp = $hi; $hi = -$lo; $lo = -$tmp;
  }
  if ($lo > $hi) { # ???
    ($lo,$hi) = ($hi,$lo);
  }

  ($lo, $hi);
}





# limitations under the License.
# </@LICENSE>

use Getopt::Std;
getopts("l:L:h");

use FindBin;
use lib "$FindBin::Bin/../lib";
use Mail::SpamAssassin::Masses;
use Getopt::Long qw(:config bundling auto_help);
use Pod::Usage;
use strict;
use warnings;

use vars qw {
$opt_c $opt_s $opt_l $opt_L $opt_inclang
};

GetOptions("c|cffile=s@" => \$opt_c,
           "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
           "l|logfile=s" => \$opt_l,
           "L|language=s" => \$opt_L,
           "include-language=s" => \$opt_inclang);

    -l LC  also print language specific rules for lang code LC (or 'all')
    -L LC  only print language specific rules for lang code LC (or 'all')

    options -l and -L are mutually exclusive.

my $lower = 1;
#$threshold = 5;
my $higher = 9;
my $min_expected = 2; # Should not be set to more than 5 or less than 2


=head1 NAME
 
find-extremes - Determine which rules are most likely to cause false positives/negatives.
 
=head1 SYNOPSIS
 
hit-frequencies [options]
 
 Options:
    -c,--cffile=path      Use path as the rules directory
    -s,--scoreset=n       Use scoreset n
    -l,--logfile=file     Read in file instead of masses.log
    -L,--language=lc      Only print language specific tests for specified lang code (try 'all')
    --include-language=lc Also print language specific tests for specified lang code (try 'all')
 
=head1 DESCRIPTION

B<hit-frequencies> will read the mass-check log F<masses.log> or the
log given by the B<--logfile> option. By default, B<hit-frequencies>
will assume the proper values for B<--cffile> based on the header of
the masses.log. The output will include the following columns:

=over 4

=item RULE

=item CHISQUARE

=item RATIO_FALSEPOS

=item OVER_FALSEPOS

=item FREQ_OVER

=back

=head1 BUGS

This script may or may not work as designed - it probably needs some
tweaking, and I probably introduced a bug into it while re-writing for
the new Masses stuff. 

=head1 NOTES

This script is poorly documented. Patches welcome.

=cut


$opt_s = 0 unless defined $opt_s;

my $ok_lang = lc ( $opt_inclang || $opt_L || '');
$ok_lang = '.' if ($ok_lang eq 'all');

my $greprules = sub {
  my ($name, $rule) = @_;

  return 0 if (($opt_L && !$rule->{lang}) ||
           ($rule->{lang} &&
            (!$ok_lang || $rule->{lang} !~ /^$ok_lang/i))); # Wrong language

  return 0 if ($rule->{tflags} =~ /\bnet\b/);

  return 1;

};

$opt_l ||= "masses.log";

if (!$opt_c || !scalar(@$opt_c)) {
    # Try to read this in from the log, if possible
    open (IN, $opt_l) or die "Can't open $opt_l: $!";
    my $files = 0; # are we in the files section?
    while(<IN>) {
        if (!$files) {
            if (/^\# SVN revision:/) {
                $opt_c = [ "$FindBin::Bin/../rules" ];
                last;
            } elsif (/^\# Using configuration:$/) {
                $files = 1;
            }
        } elsif (/^\#\s+(.*)\s*$/) {
            push (@$opt_c, $1);
        } else {
            # All done!
            last;
        }
    }

    foreach my $file (@$opt_c) {
        die "Can't read $file" unless -r $file;
    }
}

my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
                                               scoreset => $opt_s,
                                               greprules => $greprules,
                                               logfile => $opt_l,
                                               nologs => 1});

$masses->readrules();
$masses->readlogs();
$higher = 9;
$min_expected = 2; # Should not be set to more than 5 or less than 2

my $rules = $masses->get_rules_hash();
my $logs = $masses->get_logs();

my $num_spam = $masses->get_num_spam();
my $num_ham = $masses->get_num_ham();

my %freq_over_higher_falsepos = (); # how often non-nice found in ones over
                                    # higher threshold that are false positives
my %freq_nonspam = ();	# how often nice found in nonspam
my %freq_under_lower_falseneg = (); # how often nice found in ones under
                                    # lower threshold that are false negatives


my %ratio_expected_falsepos = (); # ratio version of above
my %ratio_expected_falseneg = (); # ditto

my $num_spam = 0;
my $num_nonspam = 0;
my $num_over_higher_falsepos = 0;
my $num_under_lower_falseneg = 0;
my $ok_lang = '';

my %chisquare = ( );
my %prob = ( );

$ok_lang = lc ($opt_l || $opt_L || '');
if ($ok_lang eq 'all') { $ok_lang = '.'; }

foreach my $key (keys %$rules) {

  if ($rules->{$key}->{tflags} !~ /\buserconf\b/) {
    if ($rules->{$key}->{tflags} =~ m/nice/) {
         (!$ok_lang || $rules{$key}->{lang} !~ /^$ok_lang/i)
     ) ) {
    delete $rules{$key} ; next;
  }

  if ($rules{$key}->{tflags} =~ m/net/) {
    delete $rules{$key};
    next;
  }
  if ($rules{$key}->{tflags} !~ m/userconf/) {
    if ($rules{$key}->{tflags} =~ m/nice/) {
      $freq_nonspam{$key} = 0;
      $freq_under_lower_falseneg{$key} = 0;
    } else {
      $freq_spam{$key} = 0;
      $freq_over_higher_falsepos{$key} = 0;
    }
  }

}

foreach my $log (@$logs) {

  if($log->{isspam}) {
    # Also need to count plus_hits
    my $plus_hits = 0;
    foreach my $test (@{$log->{tests_hit}}) {
      $plus_hits += $test->{score} if ($test->{score} > 0);
    }

    if(($log->{score} <= $lower) && $plus_hits && $plus_hits >= $lower) {
      $num_under_lower_falseneg++;
      foreach my $test (@{$log->{tests_hit}}) {
	$num_under_lower_falseneg++;
	$freq_under_lower_falseneg{$test->{name}}++ if exists $freq_under_lower_falseneg{$test->{name}};
      }
    }
  }
  else {
    if($log->{score} > $higher) {
      $num_over_higher_falsepos++;
      foreach my $test (@{$log->{tests_hit}}) {
	$num_over_higher_falsepos++;
	$freq_over_higher_falsepos{$test->{name}}++ if exists $freq_over_higher_falsepos{$test->{name}};
      }
    }
  }

}

unless (($num_over_higher_falsepos >= $min_expected)
	&& ($num_under_lower_falseneg >= $min_expected)) {
  die "Insufficient extremes in dataset (" . $num_over_higher_falsepos .

}

my $ratio_falsepos = $num_over_higher_falsepos/$num_spam;
my $ratio_falseneg = $num_under_lower_falseneg/$num_ham;

my $skipped_non_nice = 0;

# non-nice rules
foreach my $rule (keys %freq_over_higher_falsepos) {
  my $expected = $rules->{$rule}->{freq_spam}*$ratio_falsepos;
  if ($expected <= $min_expected) {
    $skipped_non_nice++;
    next;

   $freq_over_higher_falsepos{$rule}/$expected;
  ($chisquare{$rule},$prob{$rule}) =
   chisquare($num_spam,$num_over_higher_falsepos,
	     $rules->{$rule}->{freq_spam},$freq_over_higher_falsepos{$rule});
  if ($freq_over_higher_falsepos{$rule} < $expected) {
    $chisquare{$rule} *= -1;
  }


my $skipped_nice = 0;

# nice rules
foreach my $rule (keys %freq_under_lower_falseneg) {
  my $expected = $rules->{$rule}->{freq_ham}*$ratio_falseneg;
  if ($expected <= $min_expected) {
    $skipped_nice++;
    next;

  $ratio_expected_falseneg{$rule} =
   $freq_under_lower_falseneg{$rule}/$expected;
  ($chisquare{$rule},$prob{$rule}) =
   chisquare($num_ham,$num_under_lower_falseneg,
	     $rules->{$rule}->{freq_ham},$freq_under_lower_falseneg{$rule});
  if ($freq_under_lower_falseneg{$rule} < $expected) {
    $chisquare{$rule} *= -1;
  }


warn "Skipped nice: $skipped_nice\n";

# The rest is copied verbatim from before - its complicated and not
# commented and should work unchanged except for the freq_spam and
# freq_ham stuff and fixing some use strict stuff

my @rules_falsepos = grep {$prob{$_} < .5} (keys %over_expected_falsepos);

if (scalar(@rules_falsepos)) {
  print "RULE\t\tCHISQUARE\tRATIO_FALSEPOS\tOVER_FALSEPOS\tFREQ_OVER ($num_over_higher_falsepos)\n";
  my(@rules_falsepos_bad) =

	   $over_expected_falsepos{$a}) ||
	    ($freq_over_higher_falsepos{$b} <=>
	     $freq_over_higher_falsepos{$a})} (@rules_falsepos_bad);
    foreach my $rule (@rules_falsepos_bad) {
      print $rule . "\t" . $prob{$rule} . "\t" .
       $ratio_expected_falsepos{$rule} . "\t" .
	$over_expected_falsepos{$rule} . "\t" .

       ($chisquare{$a} <=> $chisquare{$b}) ||
	($ratio_expected_falsepos{$a} <=>
	 $ratio_expected_falsepos{$b}) ||
	  ($rules->{$b}->{freq_spam} <=>
	   $rules->{$a}->{freq_spam})} (@rules_falsepos_good);
    foreach my $rule (@rules_falsepos_good) {
      print $rule . "\t" . $prob{$rule} . "\t" .
       $ratio_expected_falsepos{$rule} . "\t" .
	$over_expected_falsepos{$rule} . "\t" .

  warn "No over-falsepos to print\n";
}

my @rules_falseneg = grep {$prob{$_} < .5} (keys %over_expected_falseneg);

if (scalar(@rules_falseneg)) {
  print "RULE\t\tCHISQUARE\tRATIO_FALSENEG\tOVER_FALSENEG\tFREQ_UNDER ($num_under_lower_falseneg)\n";

	   $over_expected_falseneg{$a}) ||
	    ($freq_under_lower_falseneg{$b} <=>
	     $freq_under_lower_falseneg{$a})} (@rules_falseneg_bad);
    foreach my $rule (@rules_falseneg_bad) {
      print $rule . "\t" . $prob{$rule} . "\t" .
       $ratio_expected_falseneg{$rule} . "\t" .
	$over_expected_falseneg{$rule} . "\t" .

       ($chisquare{$a} <=> $chisquare{$b}) ||
	($ratio_expected_falseneg{$a} <=>
	 $ratio_expected_falseneg{$b}) ||
	  ($rules->{$b}->{freq_ham} <=>
	   $rules->{$a}->{freq_ham})} (@rules_falseneg_good);
    foreach my $rule (@rules_falseneg_good) {
      print $rule . "\t" . $prob{$rule} . "\t" .
       $ratio_expected_falseneg{$rule} . "\t" .
	$over_expected_falseneg{$rule} . "\t" .

}

exit;

sub readlogs {
  my $spam = $ARGV[0] || "spam.log";
  my $nonspam = $ARGV[1] || (-f "good.log" ? "good.log" : "nonspam.log");


  (open(NONSPAM,$nonspam)) ||
   (die "Couldn't open file '$nonspam': $!; stopped");

  while (defined($line = <NONSPAM>)) {
    if ($line =~ m/^\s*\#/) {
      next;
    } elsif ($line =~ m/^.\s+-?\d+\s+\S+\s*(\S*)/) {
      my $tests = $1;
      my $hits = 0;
      my(@tests) = ();
      foreach $test (grep {length($_)} (split(/,+/,$tests))) {
	if (exists($rules{$test})) {
	  push @tests, $test;
	  $hits += $rules{$test}->{score};
	}
      }
      
      if (scalar(@tests)) {
	$num_nonspam++;
	foreach $test (grep {exists($freq_nonspam{$_})} (@tests)) {
	  $freq_nonspam{$test}++;
	}
	if ($hits >= $higher) {
	  $num_over_higher_falsepos++;
	  foreach $test (grep
			 {exists($freq_over_higher_falsepos{$_})} (@tests)) {
	    $freq_over_higher_falsepos{$test}++;
	  }
	}
      }
    } elsif ($line =~ m/\S/) {
      chomp($line);
      warn "Can't interpret line '$line'; skipping";
    }
  }

  close(NONSPAM);

  (open(SPAM,$spam)) || (die "Couldn't open file '$spam': $!; stopped");

  while (defined($line = <SPAM>)) {
    if ($line =~ m/^\s*\#/) {
      next;
    } elsif ($line =~ m/^.\s+-?\d+\s+\S+\s*(\S*)/) {
      my $tests = $1;
      my $hits = 0;
      my $plus_hits = 0;
      my(@tests) = ();
      foreach $test (grep {length($_)} (split(/,+/,$tests))) {
	if (exists($rules{$test})) {
	  push @tests, $test;
	  $hits += $rules{$test}->{score};
	  if ($rules{$test}->{score} > 0) {
	    $plus_hits += $rules{$test}->{score};
	  }
	}
      }
      
      if (scalar(@tests)) {
	$num_spam++;
	foreach $test (grep {exists($freq_spam{$_})} (@tests)) {
	  $freq_spam{$test}++;
	}
	if (($hits <= $lower) && $plus_hits &&
	    ($plus_hits >= $lower)) {
	  $num_under_lower_falseneg++;
	  foreach $test (grep
			 {exists($freq_under_lower_falseneg{$_})} (@tests)) {
	    $freq_under_lower_falseneg{$test}++;
	  }
	}
      }
    } elsif ($line =~ m/\S/) {
      chomp($line);
      warn "Can't interpret line '$line'; skipping";
    }
  }

  close(SPAM);
}


sub readscores {
  system ("./parse-rules-for-masses") and
   die "Couldn't do parse-rules-for-masses: $?; stopped";
  require "./tmp/rules.pl";
}


Return to bug 2853

Line 0 Link Here

(-)lib/Mail/SpamAssassin/Masses.pm (+833 lines)
		1	# <@LICENSE>
		2	# Copyright 2004 Apache Software Foundation
		3	#
		4	# Licensed under the Apache License, Version 2.0 (the "License");
		5	# you may not use this file except in compliance with the License.
		6	# You may obtain a copy of the License at
		7	#
		8	# http://www.apache.org/licenses/LICENSE-2.0
		9	#
		10	# Unless required by applicable law or agreed to in writing, software
		11	# distributed under the License is distributed on an "AS IS" BASIS,
		12	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		13	# See the License for the specific language governing permissions and
		14	# limitations under the License.
		15	# </@LICENSE>
		16
		17	=head1 NAME
		18
		19	Mail::SpamAssassin::Masses - Interface for reading and parsing rules
		20	and mass-check logs for SpamAssassin
		21
		22	=head1 SYNOPSIS
		23
		24	my $parser = Mail::SpamAssassin::Masses->new();
		25	my $rules = $parser->readrules();
		26	my $logs = $parser->readlogs();
		27
		28	foreach my $test (keys %$rules) {
		29	if ($rules->{$test}->{score} > 1) {
		30	...
		31	}
		32
		33	=head1 DESCRIPTION
		34
		35	Mail::SpamAssassin::Masses is a module to simplify the many scripts
		36	that used to make up the SpamAssassin re-scoring process. By
		37	consolidating all the shared code in one module, the scripts can be
		38	simplified and require fewer temporary files.
		39
		40	=head1 METHODS
		41
		42	=over 4
		43
		44	=cut
		45
		46	package Mail::SpamAssassin::Masses;
		47
		48	use strict;
		49	use warnings;
		50	use Carp;
		51
		52	=item $parser = Mail::SpamAssassin::Masses->new( [ { opt => val, ... } ] );
		53
		54	Construct a new Mail::SpamAssassin::Masses object. You may pass the
		55	following attribute-value pairs to the constructor.
		56
		57	=over 4
		58
		59	=item rulesdir
		60
		61	The directory containing rules. If multiple directories are desired,
		62	an anonymous array should be passed.
		63
		64	=item scoreset
65
66	Scoreset to deal with.
67
68	=item logfile
69
70	Filename of mass-check log.
71
72	=item falses
73
74	Also count frequencies for false positives and false negatives from
75	the logs.
76
77	=item falsesonly
78
79	Only count false positives and false negatives.
80
81	=item greprule
82
83	Coderef that is passed a rule name and a hash ref with the entries
84	containing info about the rule. If the sub returns false, it is skipped.
85
86	=item greplog
87
88	Coderef that is passed a raw log entry. If it returns false, the entry
89	is skipped.
90
91	=item sliding_window
92
93	Use a sliding window for score ranges rather than a shrinking window.
94
95	=item nologs
96
97	Save memory by not saving the individual log results, just the
98	aggregate totals
99
100	=back
101
102	=cut
103
104	sub new {
105
106	my $class = shift;
107	$class = ref($class) \|\| $class;
108
109	my $self = shift;
110	if (!defined $self){
111	$self = { };
112	}
113
114	$self->{scoreset} \|\|= 0;
115	$self->{rulesdir} \|\|= '';
116	$self->{logfile} \|\|= "masses.log";
117
118	bless($self, $class);
119
120	return $self;
121
122	}
123
124	=item $parser->readrules()
125
126	Read and parse the rules from the directory specified as
127	C<rulesdir>. This loads the following keys and values into the hash
128	entry representing the rules (see below).
129
130	=over 4
131
132	=item name
133
134	Contains the rule's name.
135
136	=item score
137
138	Contains the rule's score.
139
140	=item type
141
142	Contains the rule's type (header, body, uri, etc.)
143
144	=item tflags
145
146	Contains the rules tflags (nice, autolearn, etc.) as specified in the config file.
147
148	=item lang
149
150	Set to the value of C<lang> for language-specific tests.
151
152	=item issubrule
153
154	Set to true if the rules is a sub-rule, (i.e. it starts with
155	__). Otherwise, undefined.
156
157	=item isnice
158
159	This key exists and is true if the rule is nice (i.e. with a score
160	that can be below zero).
161
162	=item describe
163
164	Set to the rule's description, in English, or in the rule's language.
165
166	=back
167
168	There may be more values once C<readlogs()> is run.
169
170	=cut
171
172
173	sub readrules {
174
175	my $self = shift;
176
177	$self->{rules} \|\|= { };
178	my $rules = $self->{rules}; # $rules is a reference to the anon hash
179
180	my @dirs = ref($self->{rulesdir}) ? @{$self->{rulesdir}} : $self->{rulesdir};
181
182	foreach my $indir (@dirs) {
183	my @files = glob("$indir/*.cf"); # no reason to only do numbered files
184
185	foreach my $file (@files) {
186	open (IN, "<$file") \|\| croak("Can't open $file, $!");
187	while(<IN>) {
188	s/#.*$//g;
189	s/^\s+//;
190	s/\s+$//;
191	next if /^$/;
192
193	my $lang = '';
194	if (s/^lang\s+(\S+)\s+//) {
195	$lang = lc $1;
196	}
197
198	if (/^(header\|rawbody\|body\|full\|uri\|meta)\s+(\S+)\s+/) {
199	my $type = $1;
200	my $name = $2;
201
202	$rules->{$name} \|\|= { };
203	$rules->{$name}->{name} = $name;
204	$rules->{$name}->{type} = $type;
205	$rules->{$name}->{lang} = $lang if $lang;
206	$rules->{$name}->{tflags} = '';
207
208	if ($name =~ /^__/) {
209	$rules->{$name}->{issubrule} = '1';
210	}
211
212	} elsif (/^describe\s+(\S+)\s+(.+)$/) {
213
214	# Let's get description in english, por favor -- unless the rule isn't english
215
216	next if ($lang && (!$rules->{$1}->{lang} \|\| $rules->{$1}->{lang} ne $lang));
217
218	$rules->{$1} \|\|= { };
219	$rules->{$1}->{describe} = $2;
220
221	} elsif (/^tflags\s+(\S+)\s+(.+)$/) {
222	my $name = $1;
223	$rules->{$name} \|\|= { };
224	$rules->{$name}->{tflags} = $2;
225	if ($2 =~ /nice/) {
226	$rules->{$name}->{isnice} = 1;
227	}
228	} elsif (/^score\s+(\S+)\s+(.+)$/) {
229	my($name,$score) = ($1,$2);
230	$rules->{$name} \|\|= { };
231	if ( $score =~ /\s/ ) { # there are multiple scores
232	($score) = (split(/\s+/,$score))[$self->{scoreset}];
233	}
234	$rules->{$name}->{score} = $score;
235	}
236	}
237	close IN;
238	}
239	}
240	foreach my $rule (keys %{$rules}) {
241	if (!defined $rules->{$rule}->{type}) {
242	delete $rules->{$rule}; # no rule definition -> no rule
243	next;
244	}
245
246	if (!defined $rules->{$rule}->{score}) {
247	my $def = 1.0;
248	if ($rule =~ /^T_/) { $def = 0.01; }
249
250	if ($rules->{$rule}->{isnice}) {
251	$rules->{$rule}->{score} = -$def;
252	} else {
253	$rules->{$rule}->{score} = $def;
254	}
255	}
256
257	if ($self->{greprules} && !&{$self->{greprules}}($rule, $rules->{$rule}))
258	{
259	delete $rules->{$rule};
260	next;
261	}
262
263	}
264
265	$self->{_readrules} = 1;
266	}
267
268	=item $parser->readlogs()
269
270	Read and parse logs from C<logsdir>. This will create the anonymous
271	array of hashes referred to by C<$parser->{logs}>, with the following
272	keys:
273
274	=over 4
275
276	=item isspam
277
278	True if the message is spam. False or undefined otherwise.
279
280	=item isfalse
281
282	True if the message was a false negative or positive.
283
284	=item tests_hit
285
286	Array reference containing references to the hash representing each
287	rule hit.
288
289	=item score
290
291	Score the message received (under current scores).
292
293	=back
294
295	In addition, this method adds the following keys to the rule
296	information in C<$parser->{rules}>.
297
298	=over 4
299
300	=item freq_spam
301
302	Frequency hit in spam.
303
304	=item freq_ham
305
306	Frequency hit in ham.
307
308	=item freq_fp
309
310	Frequency in false positives.
311
312	=item freq_fn
313
314	Frequency in false negatives.
315
316	=back
317
318	Also, sets C<$parser->{num_spam}> and C<$parser->{num_ham}> to the number of
319	spam logs read and the number of ham logs read, respectively.
320
321	=cut
322
323	sub readlogs {
324
325	my $self = shift;
326
327	if (!$self->{_readrules}) {
328	# need to read scores first!
329	$self->readrules();
330	}
331
332	my $rules = $self->{rules}; # copy the ref, shorthand
333
334	my $logs;
335	if (! $self->{nologs}) {
336	$self->{logs} \|\|= [ ];
337	$logs = $self->{logs};
338	}
339
340
341	my ($num_spam, $num_ham, $count, $num_fp, $num_fn);
342	$num_spam = $num_ham = $count = $num_fp = $num_fn = 0;
343
344	# First, initialize stuff
345	foreach my $rule (values %{$self->{rules}}) {
346	$rule->{freq_spam} \|\|= 0;
347	$rule->{freq_ham} \|\|= 0;
348
349	if($self->{falses}) {
350	$rule->{freq_fp} \|\|= 0;
351	$rule->{freq_fn} \|\|= 0;
352	}
353
354	}
355
356	my $file = $self->{logfile};
357	open (IN, "<$file");
358
359	while (<IN>) {
360	next if /^\#/;
361	next if /^$/;
362	if($_ !~ /^(.)\s+(.)\s+-?[\d.]+\s+\S+(\s+\S+\s+)/) { warn "bad line: $_"; next; }
363
364	if ($self->{greplogs} && !&{$self->{greplogs}}($_)) {
365	next;
366	}
367
368	my $manual = $1;
369	my $result = $2;
370	$_ = $3;
371	s/(?:bayes\|time)=\S+//;
372	s/,,+/,/g;
373	s/^\s+//;
374	s/\s+$//;
375
376
377	if ($manual ne $result) {
378	$self->{isfalse} = 1;
379	}
380	elsif ($self->{falsesonly}) {
381	next;
382	}
383
384	if ($manual eq "s") {
385	$num_spam++;
386	$logs->[$count]->{isspam} = 1 unless $self->{nologs};
387	$num_fn++ if $result eq "h";
388	} else {
389	$num_ham++;
390	$num_fp++ if $result eq "s";
391	}
392
393	my @tests = ();
394	my $score = 0;
395	foreach my $tst (split (/,/, $_)) {
396	next if ($tst eq '');
397
398	# Don't count non-existant rules
399	# (Could happen with greprules)
400	next if ( !$rules->{$tst} \|\| !$rules->{$tst}->{type} );
401
402	if ($manual eq "s") {
403	$rules->{$tst}->{freq_spam}++;
404	$rules->{$tst}->{freq_fn}++ if ($self->{falses} && $result eq "h");
405	}
406	else {
407	$rules->{$tst}->{freq_ham}++;
408	$rules->{$tst}->{freq_fp}++ if ($self->{falses} && $result eq "s");
409	}
410
411	$score += $rules->{$tst}->{score};
412
413	push (@tests, $rules->{$tst}) unless $self->{nologs};
414	}
415
416	$logs->[$count]->{tests_hit} = \@tests unless $self->{nologs};
417	$logs->[$count]->{score} = $score;
418
419	$count++;
420	}
421	close IN;
422
423	$self->{num_spam} = $num_spam;
424	$self->{num_ham} = $num_ham;
425	if ($self->{falses}) {
426	$self->{num_fn} = $num_fn;
427	$self->{num_fp} = $num_fp;
428	}
429
430	$self->{_readlogs} = 1; # Done reading logs
431
432	}
433
434	=item $parser->do_statistics();
435
436	Calculate the S/O ratio and the rank for each test.
437
438	This adds the following keys to the rules hashes.
439
440	=over 4
441
442	=item spam_percent
443
444	Percentage of spam messages hit.
445
446	=item ham_percent
447
448	Percentage of ham messages hit.
449
450	=item soratio
451
452	S/O ratio -- percentage of spam messages hit divided by total
453	percentage of messages hit.
454
455	=back
456
457	=cut
458
459	sub do_statistics {
460	my $self = shift;
461
462	if (! $self->{_readlogs} ) {
463	$self->readlogs();
464	}
465
466	my $rank_hi=0;
467	my $rank_lo=999999;
468
469	foreach my $rule (values %{$self->{rules}}) {
470
471	if (!$rule->{freq_spam}) {
472	$rule->{spam_percent} = 0;
473	} else {
474	$rule->{spam_percent} = $rule->{freq_spam} / $self->{num_spam} * 100.0;
475	}
476
477	if (!$rule->{freq_ham}) {
478	$rule->{ham_percent} = 0;
479	} else {
480	$rule->{ham_percent} = $rule->{freq_ham} / $self->{num_ham} * 100.0;
481	}
482
483	if (!$rule->{freq_spam} && !$rule->{freq_ham}) {
484	$rule->{soratio} = 0.5;
485	next;
486	}
487
488	$rule->{soratio} = $rule->{spam_percent} / ($rule->{spam_percent} + $rule->{ham_percent});
489
490	}
491
492	$self->{_statistics} = 1;
493
494	}
495
496	=item $parser->do_rank();
497
498	Calculates the ranking for each rule and stores this in the
499	appropriate key.
500
501	=over 4
502
503	=item rank
504
505	"Rank" of the rule. High numbers are good, low are bad.
506
507	=back
508
509	=cut
510
511	sub do_rank {
512
513	my $self = shift;
514
515	if (! $self->{_statistics} ) {
516	$self->do_statistics();
517	}
518
519	my $rank_hi = 0;
520	my $rank_lo = 9999999;
521
522	my %unwanted;
523	my %wanted;
524	my %wranks;
525	my %uranks;
526	my $rules = $self->{rules};
527
528
529	foreach my $rule (values %{$self->{rules}}) {
530
531	$wanted{$rule->{name}} = $rule->{isnice} ? $rule->{freq_ham} : $rule->{freq_spam};
532	$unwanted{$rule->{name}} = $rule->{isnice} ? $rule->{freq_spam} : $rule->{freq_ham};
533
534	$wranks{$wanted{$rule->{name}}} = 1;
535	$uranks{$unwanted{$rule->{name}}} = 1;
536
537	}
538
539	my @wanted = sort { $wanted{$a} <=> $wanted{$b} } keys %wanted;
540	my @unwanted = sort { $unwanted{$b} <=> $unwanted{$a} } keys %unwanted;
541
542	# first half of ranking is the wanted rank
543	my $position = 0;
544	my $last = undef;
545
546	foreach my $test (@wanted) {
547	$position++ if defined $last && $last != $wanted{$test};
548	$rules->{$test}->{rank} += $position;
549	$last = $wanted{$test};
550	}
551
552	# second half is the unwanted rank
553	$position = 0;
554	$last = undef;
555	my $normalize = (scalar keys %wranks) / (scalar keys %uranks);
556
557	foreach my $test (@unwanted) {
558	$position++ if defined $last && $last != $unwanted{$test};
559	$rules->{$test}->{rank} += ($position * $normalize);
560	$last = $unwanted{$test};
561	$rank_hi = $rules->{$test}->{rank} if ($rules->{$test}->{rank} > $rank_hi);
562	$rank_lo = $rules->{$test}->{rank} if ($rules->{$test}->{rank} < $rank_lo);
563	}
564
565	$rank_hi = $rank_hi - $rank_lo;
566	foreach my $rule (values %{$rules}) {
567	$rule->{rank} = ($rank_hi == 0) ? 0.001 : (($rule->{rank} - $rank_lo)/ $rank_hi);
568	}
569
570	$self->{_rank} = 1;
571	}
572
573	=item $parser->get_rules_array();
574
575	Returns a reference to an array of hash references. The values of
576	these hash have keys as listed above.
577
578	=cut
579
580	sub get_rules_array {
581	my $self = shift;
582	return [ values %{$self->{rules}} ];
583	}
584
585	=item $parser->get_rules_hash();
586
587	Returns a reference to a hash with rule names as keys and hash
588	references as values. The values of these hash have keys as listed
589	above.
590
591	=cut
592
593	sub get_rules_hash {
594	my $self = shift;
595	return $self->{rules};
596	}
597
598	=item $parser->get_logs();
599
600	Returns a reference to the array containing log entries, in the form
601	of anonymous hashes with keys as described above.
602
603	=cut
604
605	sub get_logs {
606	my $self = shift;
607	return $self->{logs};
608	}
609
610	=item $parser->get_num_ham();
611
612	Returns number of ham logs read.
613
614	=cut
615
616	sub get_num_ham {
617	my $self = shift;
618	return $self->{num_ham};
619	}
620
621	=item $parser->get_num_spam();
622
623	Returns number of spam logs read.
624
625	=cut
626
627	sub get_num_spam {
628	my $self = shift;
629	return $self->{num_spam};
630	}
631
632	=item $parser->do_score_ranges();
633
634	Figure out range in which score can be set based on the soratio, etc.
635
636	This is necessary so that the perceptron doesn't set silly
637	scores. (This may not be as much of a problem as it was with the old
638	GA.)
639
640	This adds the following keys to the rules hashes:
641
642	=over 4
643
644	=item ismutable
645
646	Determines whether the perceptron can select a score for this test.
647
648	=item range_lo
649
650	Determines the lowest score the perceptron can set.
651
652	=item range_hi
653
654	Determines the highest score the perceptron can set.
655
656	=cut
657
658	sub do_score_ranges() {
659
660	my $self = shift;
661
662	if ( !$self->{_statistics} ) {
663	$self->do_statistics();
664	}
665	if ( !$self->{_rank} ) {
666	$self->do_rank();
667	}
668
669	foreach my $rule (values %{$self->{rules}}) {
670
671	my ($rank, $lo, $hi);
672
673	$rank = $rule->{rank};
674
675	# Get rid of rules that don't hit -- and disable completely.
676	if ($rule->{spam_percent} + $rule->{ham_percent} < 0.01 \|\|
677	$rule->{score} == 0) {
678
679	$rule->{ismutable} = 0;
680	$rule->{range_lo} = $rule->{range_hi} = 0;
681	next;
682
683	}
684
685	# next: get rid of tests that don't apply in this scoreset
686	# or are userconf -- set ismutable to 0, but keep the score
687	if ($rule->{tflags} =~ /\buserconf\b/ \|\|
688	(($self->{scoreset} % 2) == 0 && $rule->{tflags} =~/\bnet\b/)) {
689
690	$rule->{ismutable} = 0;
691	$rule->{range_lo} = $rule->{range_hi} = $rule->{score};
692	next;
693
694	}
695
696
697	# Normal rules:
698
699	# This seems to convert from [-1,1] to [0,1] but we're already in
700	# [0,1] space - Is this right?
701
702	# The current way ranks are calculated, > 0.5 and < 0.5 have no
703	# special meaning
704
705	# # 0.0 = best nice, 1.0 = best nonnice
706	# if ($rule->{isnice}) {
707	# $rank = .5 - ($rank / 2);
708	# } else {
709	# $rank = .5 + ($rank / 2);
710	# }
711
712	# ranks are really meant to be used as human readable - not as
713	# tools to choose score range - use soratio instead!
714
715	if ($self->{use_sliding_window}) {
716	($lo, $hi) = $self->sliding_window_ratio_to_range($rule->{soratio});
717	} else {
718	($lo, $hi) = $self->shrinking_window_ratio_to_range($rule->{soratio});
719	}
720
721	# Modify good rules to be lower
722	if ($rule->{isnice}) {
723	if ($rule->{tflags} =~ /\blearn\b/) { # learn rules should get
724	# higher scores (-5.4)
725	$lo *= 1.8;
726	}
727	elsif ( $rule->{soratio} <= 0.05 && $rule->{ham_percent} > 0.5) {
728	$lo *= 1.5;
729	}
730
731	# argh, ugly... but i'm copying it whole...
732	$hi = ($rule->{soratio} == 0) ? $lo :
733	($rule->{soratio} <= 0.005 ) ? $lo/1.1 :
734	($rule->{soratio} <= 0.010 && $rule->{ham_percent} > 0.2) ? $lo/2.0 :
735	($rule->{soratio} <= 0.025 && $rule->{ham_percent} > 1.5) ? $lo/10.0 :
736	0;
737
738	if ($rule->{soratio} >= 0.35 ) {
739	($lo, $hi) = (0,0);
740	}
741	}
742	else { # Make non-nice rules have higher scores if they're good
743	if ($rule->{tflags} =~ /\blearn\b/ ) {
744	$hi *= 1.8;
745	}
746	elsif ( $rule->{soratio} >= 0.99 && $rule->{spam_percent} > 1.0) {
747	$hi *= 1.5;
748	}
749
750	$lo = ($rule->{soratio} == 1) ? $hi:
751	($rule->{soratio} >= 0.995 ) ? $hi/4.0 :
752	($rule->{soratio} >= 0.990 && $rule->{spam_percent} > 1.0) ? $hi/8.0 :
753	($rule->{soratio} >= 0.900 && $rule->{spam_percent} > 10.0) ? $hi/24.0 :
754	0;
755
756	if ($rule->{soratio} <= 0.65 ) { # auto-disable bad rules
757	($lo, $hi) = (0,0);
758	}
759	}
760
761
762	# Some sanity checking
763	if($hi < $lo) {
764	($lo, $hi) = ($hi, $lo);
765	}
766
767
768	$rule->{ismutable} = ($lo == $hi) ? 0 : 1;
769	$rule->{range_lo} = $lo;
770	$rule->{range_hi} = $hi;
771
772	}
773	}
774
775	sub sliding_window_ratio_to_range {
776	my ($self, $ratio) = @_;
777
778	# (rough) graphic demo of this algorithm:
779	# 0.0 = -limit [......] 0 ........ limit
780	# 0.25 = -limit ..[..... 0 .]...... limit
781	# 0.5 = -limit ....[... 0 ...].... limit
782	# 0.75 = -limit ......[. 0 .....].. limit
783	# 1.0 = -limit ........ 0 [......] limit
784	my $sliding_window_limits = 4.8; # limits = [-$range, +$range]
785	my $sliding_window_size = 5.5; # scores have this range within limits
786
787	my $lo = -$sliding_window_limits + ($sliding_window_size * $ratio);
788	my $hi = +$sliding_window_limits - ($sliding_window_size * (1-$ratio));
789	if ($lo > $hi) {
790	($lo,$hi) = ($hi,$lo);
791	}
792	($lo, $hi);
793
794	}
795	sub shrinking_window_ratio_to_range {
796	my ($self, $ratio) = @_;
797
798
799	# 0.0 = -limit [......] 0 ........ limit
800	# 0.25 = -limit ....[... 0 ]....... limit
801	# 0.5 = -limit ......[. 0 .]...... limit (note: tighter)
802	# 0.75 = -limit .......[ 0 ...].... limit
803	# 1.0 = -limit ........ 0 [......] limit
804	my $shrinking_window_lower_base = 0.00;
805	my $shrinking_window_lower_range = 1.00; # *ratio, added to above
806	my $shrinking_window_size_base = 1.00;
807	my $shrinking_window_size_range = 1.00; # *ratio, added to above
808
809	my $is_nice = 0;
810	my $adjusted = ($ratio -.5) * 2; # adj [0,1] to [-1,1]
811	if ($adjusted < 0) { $is_nice = 1; $adjusted = -$adjusted; }
812
813	#$adjusted /= 1.5 if ( $ratio < 0.95 && $ratio > 0.15 ); # tvd
814
815	my $lower = $shrinking_window_lower_base
816	+ ($shrinking_window_lower_range * $adjusted);
817	my $range = $shrinking_window_size_base
818	+ ($shrinking_window_size_range * $adjusted);
819	my $lo = $lower;
820	my $hi = $lower + $range;
821	if ($is_nice) {
822	my $tmp = $hi; $hi = -$lo; $lo = -$tmp;
823	}
824	if ($lo > $hi) {
825	($lo,$hi) = ($hi,$lo);
826	}
827
828	($lo, $hi);
829	}
830
831
832	# Pacify perl
833	1;

Lines 16-400 Link Here

(-)hit-frequencies (-313 / +180 lines)
16	# limitations under the License.	16	# limitations under the License.
17	# </@LICENSE>	17	# </@LICENSE>
18		18
		19
19	use FindBin;	20	use FindBin;
20	use Getopt::Std;	21	use lib "$FindBin::Bin/../lib";
21	getopts("fm:M:X:l:L:pxhc:at:s:i");	22	use Mail::SpamAssassin::Masses;
		23	use Getopt::Long qw(:config bundling auto_help);
		24	use Pod::Usage;
		25	use strict;
		26	use warnings;
22		27
		28
23	use vars qw {	29	use vars qw {
24	$opt_f $opt_m $opt_M $opt_X $opt_p $opt_x $opt_h $opt_l $opt_L $opt_c	30	$opt_f $opt_m $opt_M $opt_X $opt_p $opt_x $opt_h $opt_l $opt_L $opt_c
25	$opt_a $opt_t $opt_s $opt_i $sorting	31	$opt_a $opt_t $opt_s $opt_z $opt_inclang $opt_auto
26	};	32	};
27		33
28	sub usage {	34	GetOptions("c\|cffile=s@" => \$opt_c,
29	die "hit-frequencies [-c rules dir] [-f] [-m RE] [-M RE] [-X RE] [-l LC]	35	"s\|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
30	[-s SC] [-a] [-p] [-x] [-i] [spam log] [ham log]	36	"l\|logfile=s" => \$opt_l,
		37	"f\|falses" => \$opt_f,
		38	"a\|all" => \$opt_a,
		39	"p\|percentages" => \$opt_p,
		40	"x\|extended" => \$opt_x,
		41	"m\|matchrule=s" => \$opt_m, #,
		42	"t\|tflags=s" => \$opt_t,
		43	"M\|matchlog=s" => \$opt_M,
		44	"X\|excludelog=s" => \$opt_X,
		45	"L\|language=s" => \$opt_L,
		46	"include-language=s" => \$opt_inclang);
31		47
32	-c p use p as the rules directory
33	-f falses. count only false-negative or false-positive matches
34	-m RE print rules matching regular expression
35	-t RE print rules with tflags matching regular expression
36	-M RE only consider log entries matching regular expression
37	-X RE don't consider log entries matching regular expression
38	-l LC also print language specific rules for lang code LC (or 'all')
39	-L LC only print language specific rules for lang code LC (or 'all')
40	-a display all tests
41	-p percentages. implies -x
42	-x extended output, with S/O ratio and scores
43	-s SC which scoreset to use
44	-i use IG (information gain) for ranking
45		48
46	options -l and -L are mutually exclusive.	49	=head1 NAME
47		50
48	options -M and -X are not mutually exclusive.	51	hit-frequencies - Display statistics about tests hit by a mass-check run
49		52
50	if either the spam or and ham logs are unspecified, the defaults	53	=head1 SYNOPSIS
51	are \"spam.log\" and \"ham.log\" in the cwd.
52		54
53	";	55	hit-frequencies [options]
54	}
55		56
56	usage() if($opt_h \|\| ($opt_l && $opt_L));	57	Options:
		58	-c,--cffile=path Use path as the rules directory
		59	-s,--scoreset=n Use scoreset n
		60	-l,--logfile=file Read in file instead of masses.log
		61	-f Count only false-positives/false-negatives
		62	-a Report all tests (including subrules)
		63	-p Report percentages instead of raw hits
		64	-x "Extended" output, include RANK, S/O and SCORE
		65	-m,--matchrule=re Print rules matching the regular expression
		66	-t,--tflags=re Print only rules with tflags matching the regular expression
		67	-M,--matchlog=re Consider only logs matching the regular expression
		68	-X,--excludelog=re Exclude logs matching this regular expression
		69	-L,--language=lc Only print language specific tests for specified lang code (try 'all')
		70	--include-language=lc Also print language specific tests for specified lang code (try 'all')
57		71
58	if ($opt_p) {	72	=head1 DESCRIPTION
59	$opt_x = 1;
60	}
61		73
62	$opt_s = 0 if ( !defined $opt_s );	74	B<hit-frequencies> will read the mass-check log F<masses.log> or the
		75	log given by the B<--logfile> option. The output will contain a
		76	summary of the number of ham and spam messages and detailed statistics
		77	for each rule. By default, B<hit-frequencies> will try to guess the
		78	proper values for B<--cffile> based on the header of the
		79	masses.log. The output will include the following columns:
63		80
64	my $cffile = $opt_c \|\| "$FindBin::Bin/../rules";	81	=over 4
65		82
66	my %freq_spam = ();	83	=item OVERALL
67	my %freq_ham = ();
68	my $num_spam = 0;
69	my $num_ham = 0;
70	my %ranking = ();
71	my $ok_lang = '';
72		84
73	readscores($cffile);	85	Number of times (or percentage with B<-p>) the rule hit on
		86	all messages (spam or ham).
74		87
75	$ok_lang = lc ($opt_l \|\| $opt_L \|\| '');	88	=item SPAM
76	if ($ok_lang eq 'all') { $ok_lang = '.'; }
77		89
78	foreach my $key (keys %rules) {	90	Number of times (or percentage with B<-p>) the rule hit on
		91	spam messages.
79		92
80	if ( ($opt_L && !$rules{$key}->{lang}) \|\|	93	=item HAM
81	($rules{$key}->{lang} &&
82	(!$ok_lang \|\| $rules{$key}->{lang} !~ /^$ok_lang/i)
83	) ) {
84	delete $rules{$key} ; next;
85	}
86		94
87	$freq_spam{$key} = 0;	95	Number of times (or percentage with B<-p>) the rule hit on
88	$freq_ham{$key} = 0;	96	ham messages.
89	}
90		97
91	readlogs();	98	=item S/O
92		99
93	my $hdr_all = $num_spam + $num_ham;	100	Shown only with B<-x> or B<-p>, this is the number of spam hits
94	my $hdr_spam = $num_spam;	101	divided by total number of hits (C<S/O> refers to spam divided by
95	my $hdr_ham = $num_ham;	102	overall).
96		103
97	if ($opt_p) {	104	=item RANK
98	my $sorting = $opt_i ? "IG" : "RANK";
99	if ($opt_f) {
100	printf "%7s %7s %7s %6s %6s %6s %s\n",
101	"OVERALL%", "FNEG%", "FPOS%", "S/O", $sorting, "SCORE", "NAME";
102	} else {
103	printf "%7s %7s %7s %6s %6s %6s %s\n",
104	"OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME";
105	}
106	printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n",
107	$hdr_all, $hdr_spam, $hdr_ham,
108	soratio ($num_spam,$num_ham), 0, 0;
109		105
110	$hdr_spam = ($num_spam / $hdr_all) * 100.0;	106	Shown only with B<-x> or B<-p>, this is a measure that attempts to
111	$hdr_ham = ($num_ham / $hdr_all) * 100.0;	107	indicate how I<good> or I<useful> a test is. The higher it is, the
112	$hdr_all = 100.0; # this is obvious	108	better the test.
113	printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f (all messages as %%)\n",
114	$hdr_all, $hdr_spam, $hdr_ham,
115	soratio ($num_spam,$num_ham), 0, 0;
116		109
117	} elsif ($opt_x) {	110	=item SCORE
118	printf "%7s %7s %7s %6s %6s %6s %s\n",
119	"OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME";
120	printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n",
121	$hdr_all, $hdr_spam, $hdr_ham,
122	soratio ($num_spam,$num_ham), 0, 0;
123		111
124	} else {	112	Shown only with B<-x> or B<-p>, this is the current score assigned to
125	printf "%10s %10s %10s %s\n",	113	the rule.
126	"OVERALL", "SPAM", "HAM", "NAME";
127	printf "%10d %10d %10d (all messages)\n",
128	$hdr_all, $hdr_spam, $hdr_ham;
129	}
130		114
131	my %done = ();	115	=item NAME
132	my @tests = ();
133	my $rank_hi = 0;
134	my $rank_lo = 9999999;
135		116
136	# variables for wanted/unwanted RANK	117	This is the rule's name.
137	my %wanted;
138	my %unwanted;
139	my %wranks;
140	my %uranks;
141		118
142	foreach my $test (keys %freq_spam, keys %freq_ham) {	119	=back
143	next unless (exists $rules{$test}); # only valid tests
144	next if (!$opt_a && $rules{$test}->{issubrule});
145		120
146	next if $done{$test}; $done{$test} = 1;	121	=head1 BUGS
147	push (@tests, $test);
148		122
149	my $isnice = 0;	123	Please report bugs to http://bugzilla.spamassassin.org/
150	if ($rules{$test}->{tflags} =~ /nice/) { $isnice = 1; }
151		124
152	my $fs = $freq_spam{$test}; $fs \|\|= 0;	125	=head1 SEE ALSO
153	my $fn = $freq_ham{$test}; $fn \|\|= 0;
154	my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0;
155	my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0;
156		126
157	my $soratio = $soratio{$test} = soratio ($fsadj, $fnadj);	127	L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>
158		128
159	if ($isnice) {	129	=cut
160	$soratio = 1.0 - $soratio;
161	my $tmp = $fsadj; $fsadj = $fnadj; $fnadj = $tmp;
162	}
163		130
164	if ($opt_i) {	131	if ($opt_L && $opt_inclang) {
165	# come up with a ranking	132	pod2usage("-L/--language and --include-language are mutually exclusive");
166	my $rank;
167
168	# New new system: from "Learning to Filter Unsolicited Commercial E-Mail",
169	# Ion Androutsopoulos et al: determine the information gain IG(X, C) of the
170	# Boolean attributes (ie. the rules). Measures "the average reduction in
171	# the entropy of C (classification) given the value of X (the rule)". Makes
172	# a good ranking measure with a proper statistical basis. ;)
173	#
174	# Still would like to get an entropy measure in, too.
175	#
176	# sum P(X = x ^ C = c)
177	# IG(X,C) = x in [0, 1] P(X = x ^ C = c) . log2( ------------------- )
178	# c in [Ch, Cs] P(X = x) . P(C = c)
179	#
180	my $safe_nspam = $num_spam \|\| 0.0000001;
181	my $safe_nham = $num_ham \|\| 0.0000001;
182
183	my $num_all = ($num_spam + $num_ham);
184	my $safe_all = $num_all \|\| 0.0000001;
185	my $f_all = $fs+$fn;
186
187	my $px0 = (($num_all - $f_all) / $safe_all); # P(X = 0)
188	my $px1 = ($f_all / $safe_all); # P(X = 1)
189	my $pccs = ($num_spam / $safe_all); # P(C = Cs)
190	my $pcch = ($num_ham / $safe_all); # P(C = Ch)
191	my $px1ccs = ($fs / $safe_nspam); # P(X = 1 ^ C = Cs)
192	my $px1cch = ($fn / $safe_nham); # P(X = 1 ^ C = Ch)
193	my $px0ccs = (($num_spam - $fs) / $safe_nspam); # P(X = 0 ^ C = Cs)
194	my $px0cch = (($num_ham - $fn) / $safe_nham); # P(X = 0 ^ C = Ch)
195	my $safe_px0_dot_pccs = ($px0 * $pccs) \|\| 0.00000001;
196	my $safe_px0_dot_pcch = ($px0 * $pcch) \|\| 0.00000001;
197	my $safe_px1_dot_pccs = ($px1 * $pccs) \|\| 0.00000001;
198	my $safe_px1_dot_pcch = ($px1 * $pcch) \|\| 0.00000001;
199
200	sub log2 { return log($_[0]) / 0.693147180559945; } # log(2) = 0.6931...
201
202	my $safe_px0ccs = ($px0ccs \|\| 0.0000001);
203	my $safe_px0cch = ($px0cch \|\| 0.0000001);
204	my $safe_px1ccs = ($px1ccs \|\| 0.0000001);
205	my $safe_px1cch = ($px1cch \|\| 0.0000001);
206	$rank = ( $px0ccs * log2($safe_px0ccs / $safe_px0_dot_pccs) ) +
207	( $px0cch * log2($safe_px0cch / $safe_px0_dot_pcch) ) +
208	( $px1ccs * log2($safe_px1ccs / $safe_px1_dot_pccs) ) +
209	( $px1cch * log2($safe_px1cch / $safe_px1_dot_pcch) );
210
211	$ranking{$test} = $rank;
212	$rank_hi = $rank if ($rank > $rank_hi);
213	$rank_lo = $rank if ($rank < $rank_lo);
214	}
215	else {
216	# basic wanted/unwanted ranking
217	$wanted{$test} = $isnice ? $fn : $fs;
218	$unwanted{$test} = $isnice ? $fs : $fn;
219	# count number of ranks of each type
220	$wranks{$wanted{$test}} = 1;
221	$uranks{$unwanted{$test}} = 1;
222	}
223	}	133	}
224		134
225	# finish basic wanted/unwanted ranking	135	if ($opt_p) {
226	if (! $opt_i) {	136	$opt_x = 1;
227	my @wanted = sort { $wanted{$a} <=> $wanted{$b} } keys %wanted;
228	my @unwanted = sort { $unwanted{$b} <=> $unwanted{$a} } keys %unwanted;
229
230	# first half of ranking is the wanted rank
231	my $position = 0;
232	my $last = undef;
233	for my $test (@wanted) {
234	$position++ if defined $last && $last != $wanted{$test};
235	$ranking{$test} += $position;
236	$last = $wanted{$test}
237	}
238
239	# second half of ranking is the unwanted rank
240	my $normalize = (scalar keys %wranks) / (scalar keys %uranks);
241	$position = 0;
242	$last = undef;
243	for my $test (@unwanted) {
244	$position++ if defined $last && $last != $unwanted{$test};
245	$ranking{$test} += ($position * $normalize);
246	$last = $unwanted{$test};
247	$rank_hi = $ranking{$test} if ($ranking{$test} > $rank_hi);
248	$rank_lo = $ranking{$test} if ($ranking{$test} < $rank_lo);
249	}
250	}	137	}
251		138
252	{	139	$opt_s = 0 if ( !defined $opt_s );
253	# now normalise the rankings to [0, 1]
254	$rank_hi -= $rank_lo;
255	foreach $test (@tests) {
256	$ranking{$test} = $rank_hi == 0 ? 0.001 : ($ranking{$test} - $rank_lo) / ($rank_hi);
257	}
258	}
259		140
260	foreach $test (sort { $ranking{$b} <=> $ranking{$a} } @tests) {	141	my $ok_lang = lc ( $opt_inclang \|\| $opt_L \|\| '');
261	next unless (exists $rules{$test}); # only valid tests	142	$ok_lang = '.' if ($ok_lang eq 'all');
262	next if (!$opt_a && $rules{$test}->{issubrule});
263		143
264	my $fs = $freq_spam{$test}; $fs \|\|= 0;	144	my $greprules = sub { # To determine whether rule should be read
265	my $fn = $freq_ham{$test}; $fn \|\|= 0;	145	my ($name, $rule) = @_;
266	my $fa = $fs+$fn;
267		146
268	next if ($opt_m && $test !~ m/$opt_m/); # match certain tests	147	return 0 if ($opt_m && $name !~ /$opt_m/); # name doesn't match -m
269	next if ($opt_t && $rules{$test}->{tflags} !~ /$opt_t/); # match tflags	148	# expression
		149	return 0 if ($opt_t && $rule->{tflags} !~ /$opt_t/); # tflags don't
		150	# match -t
		151	# expression
		152	return 0 if (($opt_L && !$rule->{lang}) \|\|
		153	($rule->{lang} &&
		154	(!$ok_lang \|\| $rule->{lang} !~ /^$ok_lang/i))); # Wrong language
270		155
		156	return 0 if ($rule->{issubrule} && !$opt_a);
		157
271	if (!$opt_a && !$opt_t) {	158	if (!$opt_a && !$opt_t) {
272	next if ($rules{$test}->{tflags} =~ /net/ && ($opt_s % 2 == 0)); # not net tests	159	return 0 if ($rule->{tflags} =~ /net/ && ($opt_s % 2 == 0));
273	next if ($rules{$test}->{tflags} =~ /userconf/); # or userconf	160	return 0 if ($rule->{tflags} =~ /userconf/); # or userconf
274	}	161	}
		162	return 1;
275		163
276	# adjust based on corpora sizes (and cvt to % while we're at it)	164	};
277	my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0;
278	my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0;
279		165
280	if ($opt_f && $fsadj == 0 && $fnadj == 0) { next; }
281		166
282	if ($opt_p) {	167	my $logfile = $opt_l \|\| "masses.log";
283	$fa = ($fa / ($num_spam + $num_ham)) * 100.0;
284	$fs = $fsadj;
285	$fn = $fnadj;
286	}
287		168
288	my $soratio = $soratio{$test};	169	if (!$opt_c \|\| !scalar(@$opt_c)) {
289	if (!defined $soratio) {	170	# Try to read this in from the log, if possible
290	$soratio{$test} = soratio ($fsadj, $fnadj);	171	open IN, $logfile or die "Can't open $logfile: $!";
291	}	172	my $files = 0; # are we in the files section?
		173	while(<IN>) {
		174	if (!$files) {
		175	if (/^\# SVN revision:/) {
		176	$opt_c = [ "$FindBin::Bin/../rules" ];
		177	last;
		178	} elsif (/^\# Using configuration:$/) {
		179	$files = 1;
		180	}
		181	} elsif (/^\#\s+(.)\s$/) {
		182	push (@$opt_c, $1);
		183	} else {
		184	# All done!
		185	last;
		186	}
		187	}
292		188
293	if ($opt_p) {	189	foreach my $file (@$opt_c) {
294	printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f %s\n",	190	die "Can't read $file" unless -r $file;
295	$fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}, $test;	191	}
296
297	} elsif ($opt_x) {
298	printf "%7d %7d %7d %7.3f %6.2f %6.2f %s\n",
299	$fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}, $test;
300
301	} else {
302	printf "%10d %10d %10d %s\n", $fa, $fs, $fn, $test;
303	}
304	}	192	}
305	exit;	193
		194	my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
		195	scoreset => $opt_s,
		196	falsesonly => $opt_f,
		197	greprules => $greprules,
		198	logfile => $logfile,
		199	nologs => 1});
306		200
		201	$masses->readrules();
		202	$masses->readlogs();
		203	$masses->do_statistics();
		204	$masses->do_rank();
307		205
		206	my $rules = $masses->get_rules_hash();
		207	my $num_ham = $masses->get_num_ham();
		208	my $num_spam = $masses->get_num_spam();
		209	my $num_all = $num_ham + $num_spam;
308		210
309	sub readlogs {	211	if ($num_ham + $num_spam <= 0) {
310	my $spam = $ARGV[0] \|\| "spam.log";	212	die "Can't run hit-frequencies on 0 messages.";
311	my $ham = $ARGV[1] \|\| (-f "good.log" ? "good.log" : "ham.log");	213	}
312		214
313	foreach my $file ($spam, $ham) {	215	## Write header
314	open (IN, "<$file") \|\| die "Could not open file '$file': $!";
315		216
316	my $isspam = 0; ($file eq $spam) and $isspam = 1;	217	if ($opt_p) {
317		218
318	while (<IN>) {	219	if ($opt_f) {
319	next if (/^#/);	220	printf "%7s %7s %7s %6s %6s %6s %s\n",
320	next unless (!$opt_M \|\| /$opt_M/o);	221	"OVERALL%", "FNEG%", "FPOS%", "S/O", "RANK", "SCORE", "NAME";
321	next if ($opt_X && /$opt_X/o);	222	} else {
		223	printf "%7s %7s %7s %6s %6s %6s %s\n",
		224	"OVERALL%", "SPAM%", "HAM%", "S/O", "RANK", "SCORE", "NAME";
		225	}
322		226
323	/^(.)\s+(-?\d+)\s+(\S+)\s(\S)/ or next;	227	printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n",
324	my $caught = ($1 eq 'Y');	228	$num_all, $num_spam, $num_ham,
325	my $hits = $2;	229	$num_spam / $num_all, 0, 0;
326	$_ = $4; s/,,+/,/g;
327		230
328	if ($isspam) {	231	printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f (all messages as %%)\n",
329	if ($opt_f) {	232	100.0, $num_spam / $num_all * 100.0, $num_ham / $num_all * 100.0,
330	if (!$caught) { $num_spam++; }	233	$num_spam / $num_all, 0, 0;
331	} else {
332	$num_spam++;
333	}
334	} else {
335	if ($opt_f) {
336	if ($caught) { $num_ham++; }
337	} else {
338	$num_ham++;
339	}
340	}
341		234
342	my @tests = split (/,/, $_);	235	} elsif ($opt_x) {
343	foreach my $t (@tests) {	236	printf "%7s %7s %7s %6s %6s %6s %s\n",
344	next if ($t eq '');	237	"OVERALL", "SPAM", "HAM", "S/O", "RANK", "SCORE", "NAME";
345	if ($isspam) {	238	printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n",
346	if ($opt_f) {	239	$num_all, $num_spam, $num_ham,
347	if (!$caught) { $freq_spam{$t}++; }	240	$num_spam / $num_all, 0, 0;
348	} else {
349	$freq_spam{$t}++;
350	}
351	} else {
352	if ($opt_f) {
353	if ($caught) { $freq_ham{$t}++; }
354	} else {
355	$freq_ham{$t}++;
356	}
357	}
358	}
359	}
360	close IN;
361	}
362	}
363		241
364		242	} else {
365	sub readscores {	243	printf "%10s %10s %10s %s\n",
366	my($cffile) = @_;	244	"OVERALL", "SPAM", "HAM", "NAME";
367	system ("$FindBin::Bin/parse-rules-for-masses -d \"$cffile\" -s $opt_s") and die;	245	printf "%10d %10d %10d (all messages)\n",
368	require "./tmp/rules.pl";	246	$num_all, $num_spam, $num_ham;
369	}	247	}
370		248
371	sub soratio {	249	foreach my $test (sort { $rules->{$b}->{rank} <=> $rules->{$a}->{rank} } keys %{$rules}) {
372	my ($s, $n) = @_;
373		250
374	$s \|\|= 0;	251	if ($opt_p) {
375	$n \|\|= 0;	252	printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f %s\n",
376		253	($rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham}) / $num_all * 100.0,
377	if ($s + $n > 0) {	254	$rules->{$test}->{spam_percent}, $rules->{$test}->{ham_percent},
378	return $s / ($s + $n);	255	$rules->{$test}->{soratio}, $rules->{$test}->{rank}, $rules->{$test}->{score}, $test;
		256	} elsif ($opt_x) {
		257	printf "%7d %7d %7d %7.3f %6.2f %6.2f %s\n",
		258	$rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham},
		259	$rules->{$test}->{freq_spam}, $rules->{$test}->{freq_ham},
		260	$rules->{$test}->{soratio}, $rules->{$test}->{rank}, $rules->{$test}->{score}, $test;
379	} else {	261	} else {
380	return 0.5; # no results -> not effective	262	printf "%10d %10d %10d %s\n",
		263	$rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham},
		264	$rules->{$test}->{freq_spam}, $rules->{$test}->{freq_ham}, $test;
381	}	265	}
382	}	266	}
383		267
384	sub tcr {
385	my ($nspam, $nlegit, $nspamspam, $nlegitspam) = @_;
386	my $nspamlegit = $nspam - $nspamspam;
387	my $nlegitlegit = $nlegit - $nlegitspam;
388
389	my $lambda = 99;
390
391	my $werr = ($lambda * $nlegitspam + $nspamlegit)
392	/ ($lambda * $nlegit + $nspam);
393
394	my $werr_base = $nspam
395	/ ($lambda * $nlegit + $nspam);
396
397	$werr \|\|= 0.000001; # avoid / by 0
398	my $tcr = $werr_base / $werr;
399	return $tcr;
400	}

Lines 16-47 Link Here

(-)rewrite-cf-with-new-scores (-21 / +110 lines)
16	# limitations under the License.	16	# limitations under the License.
17	# </@LICENSE>	17	# </@LICENSE>
18		18
		19	=head1 NAME
		20
		21	rewrite-cf-with-new-scores - Rewrite SpamAssassin scores file with new
		22	scores.
		23
		24	=head1 SYNOPSIS
		25
		26	rewrite-cf-with-new-scores [options]
		27
		28	Options
		29	--old-scores=file Read file containing the old SpamAssassin scores
		30	--new-scores=file Read file containing the new SpamAssassin scores
		31	-s,--scoreset n Rewrite scoreset n
		32	--output=file Output rewritten score file to file
		33	-c,--cffile=path Use path as the rules directory
		34	-l,--logfile=file Use file instead of masses.log (for guessing -c)
		35
		36	Note: these options can be shortened (i.e. --old, --new, --out) as
		37	long as they are unambiguous.
		38
		39	=head1 DESCRIPTION
		40
		41	B<rewrite-cf-with-new-scores> is a tool to update the sitewide scores
		42	file with the newly generated scores. Since SpamAssassin has four
		43	different scoresets, which each need to be generated separately, this
		44	tool is used to only change the correct scoreset.
		45
		46	By default, the old scores are read from 50_scores.cf in the rules
		47	directory and the new ones from ./perceptron.scores. The output will
		48	be ./50_scores.cf by default.
		49
		50	The rules directory needs to be used to make sure scores are given for
		51	the right tests. Rules not found in the rules directory will not be
		52	given scores in the output.
		53
		54	=head1 BUGS
		55
		56	Please report bugs to http://bugzilla.spamassassin.org/
		57
		58	=head1 SEE ALSO
		59
		60	L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>
		61
		62	=cut
		63
		64	use FindBin;
		65	use lib "$FindBin::Bin/../lib";
		66	use Getopt::Long qw(:config bundling auto_help);
		67	use Mail::SpamAssassin::Masses;
		68	use Pod::Usage;
		69	use strict;
		70	use warnings;
		71
		72	use vars qw($opt_old $opt_new $opt_scoreset $opt_out $opt_c $opt_l);
		73
		74	GetOptions("old-scores=s" => \$opt_old,
		75	"new-scores=s" => \$opt_new,
		76	"s\|scoreset=i" => \$opt_scoreset,
		77	"output=s" => \$opt_out,
		78	"c\|cffile=s@" => \$opt_c,
		79	"l\|logfile=s" => \$opt_l);
		80
		81	$opt_l \|\|= "masses.log";
		82	$opt_scoreset = 0 unless defined $opt_scoreset;
83
19	my $NUM_SCORESETS = 4;	84	my $NUM_SCORESETS = 4;
20		85
21	my ($scoreset,$oldscores,$newscores) = @ARGV;	86	if (!$opt_c \|\| !scalar(@$opt_c)) {
		87	# Try to read this in from the log, if possible
		88	open IN, $opt_l or die "Can't open $opt_l: $!";
		89	my $files = 0; # are we in the files section?
		90	while(<IN>) {
		91	if (!$files) {
		92	if (/^\# SVN revision:/) {
		93	$opt_c = [ "$FindBin::Bin/../rules" ];
		94	last;
		95	} elsif (/^\# Using configuration:$/) {
		96	$files = 1;
		97	}
		98	} elsif (/^\#\s+(.)\s$/) {
		99	push (@$opt_c, $1);
		100	} else {
		101	# All done!
		102	last;
		103	}
		104	}
22		105
23	$scoreset = int($scoreset) if defined $scoreset;	106	foreach my $file (@$opt_c) {
24	if (!defined $newscores \|\| $scoreset < 0 \|\| $scoreset >= $NUM_SCORESETS ) {	107	die "Can't read $file" unless -r $file;
25	die "usage: rewrite-cf-with-new-scores scoreset oldscores.cf newscores.cf\n";	108	}
26	}	109	}
27		110
28	system ("./parse-rules-for-masses -s $scoreset") and die;	111	if (!$opt_old) {
29	if (-e "tmp/rules.pl") {	112	$opt_old = $$opt_c[0] . "/50_scores.cf";
30	# Note, the spaces need to stay in front of the require to work around a RPM 4.1 problem
31	require "./tmp/rules.pl";
32	}	113	}
33	else {
34	die "parse-rules-for-masses had no error but no tmp/rules.pl!?!";
35	}
36		114
		115	$opt_new \|\|= "50_scores.cf";
		116
		117	my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
		118	scoreset => $opt_scoreset});
		119
		120	$masses->readrules();
		121	my $rules = $masses->get_rules_hash();
		122
37	# now read the generated scores	123	# now read the generated scores
38	my @gascoreorder = ();	124	my @gascoreorder = ();
		125	my %oldscores = ();
39	my %gascorelines = ();	126	my %gascorelines = ();
40	open (STDIN, "<$newscores") or die "cannot open $newscores";	127	open (STDIN, "<$opt_new") or die "cannot open $opt_new";
41	while (<STDIN>) {	128	while (<STDIN>) {
42	/^score\s+(\S+)\s+(-?\d+(?:\.\d+)?)/ or next;	129	/^score\s+(\S+)\s+(-?\d+(?:\.\d+)?)/ or next;
43	my $name = $1; my $score = $2;	130	my $name = $1; my $score = $2;
44	next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0);	131	next unless (exists ($rules->{$name}) && !$rules->{$name}->{issubrule});
45	next if ($name =~ /^__/);	132	next if ($name =~ /^__/);
46	next if ($name eq '(null)'); # er, oops ;)	133	next if ($name eq '(null)'); # er, oops ;)
47		134
Lines 49-55 Link Here
49	push (@gascoreorder, $name);	136	push (@gascoreorder, $name);
50	}	137	}
51		138
52	open (IN, "<$oldscores") or die "cannot open $oldscores";	139	open (IN, "<$opt_old") or die "cannot open $opt_old";
53	my $out = '';	140	my $out = '';
54	my $pre = '';	141	my $pre = '';
55		142
Lines 58-64 Link Here
58	while (<IN>) {	145	while (<IN>) {
59	if (/^\s*score\s+(\S+)\s/) {	146	if (/^\s*score\s+(\S+)\s/) {
60	delete $gascorelines{$1};	147	delete $gascorelines{$1};
61	next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0);	148	next unless (exists ($rules->{$1}) && $rules->{$1}->{issubrule} == 0);
62	}	149	}
63	$pre .= $_;	150	$pre .= $_;
64	/^# Start of generated scores/ and last;	151	/^# Start of generated scores/ and last;
Lines 82-91 Link Here
82	if (/^\s*score\s+\S+/) {	169	if (/^\s*score\s+\S+/) {
83	my($score,$name,@scores) = split;	170	my($score,$name,@scores) = split;
84		171
85	next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0);	172	next unless (exists ($rules->{$name}) && !$rules->{$name}->{issubrule});
86	if (defined $gascorelines{$name}) {	173	if (defined $gascorelines{$name}) {
87	# Set appropriate scoreset value	174	# Set appropriate scoreset value
88	$scores[$scoreset] = $gascorelines{$name};	175	$scores[$opt_scoreset] = $gascorelines{$name};
89		176
90	# Create new score line	177	# Create new score line
91	$_ = join(" ","score",$name,generate_scores(@scores))."\n";	178	$_ = join(" ","score",$name,generate_scores(@scores))."\n";
Lines 96-103 Link Here
96	}	183	}
97	close IN;	184	close IN;
98		185
		186	open OUT, ">$opt_out" or die "Can't open $opt_out: $!";
		187
99	# and output the lot	188	# and output the lot
100	print $pre, "\n";	189	print OUT $pre, "\n";
101	foreach my $name (@gascoreorder) {	190	foreach my $name (@gascoreorder) {
102	$_ = $gascorelines{$name};	191	$_ = $gascorelines{$name};
103	next unless (defined ($_));	192	next unless (defined ($_));
Lines 107-118 Link Here
107	@scores = @{$oldscores{$name}} if ( exists $oldscores{$name} );	196	@scores = @{$oldscores{$name}} if ( exists $oldscores{$name} );
108		197
109	# Set appropriate scoreset value	198	# Set appropriate scoreset value
110	$scores[$scoreset] = $_;	199	$scores[$opt_scoreset] = $_;
111		200
112	# Create new score line	201	# Create new score line
113	print join(" ","score",$name,generate_scores(@scores)),"\n";	202	print OUT join(" ","score",$name,generate_scores(@scores)),"\n";
114	}	203	}
115	print "\n", $out, "\n";	204	print OUT "\n", $out, "\n";
116		205
117	sub generate_scores {	206	sub generate_scores {
118	my (@scores) = @_;	207	my (@scores) = @_;

Lines 4-36 Link Here

(-)Makefile (-15 / +8 lines)
4		4
5	# What rule scoreset are we using?	5	# What rule scoreset are we using?
6	SCORESET = 0	6	SCORESET = 0
		7	LOGFILE = masses.log
7		8
8	#### Should be no need to modify below this line	9	#### Should be no need to modify below this line
9		10
10	all: badrules perceptron	11	all: badrules perceptron
11		12
12	perceptron: perceptron.o	13	perceptron: perceptron.o
13	$(CC) -o perceptron perceptron.o $(LDFLAGS)	14	$(CC) -o perceptron perceptron.o $(LDFLAGS)
14		15
15	perceptron.o: tmp/rules.pl tmp/tests.h tmp/scores.h	16	perceptron.o: tmp/tests.h
16	$(CC) $(CFLAGS) -c -o perceptron.o perceptron.c	17	$(CC) $(CFLAGS) -c -o perceptron.o perceptron.c
17		18
18	tmp/rules.pl: tmp/.created parse-rules-for-masses	19	tmp/tests.h: tmp/.created logs-to-c
19	perl parse-rules-for-masses -d ../rules -s $(SCORESET)	20	perl logs-to-c --scoreset=$(SCORESET) --logfile=$(LOGFILE)
20		21
21	tmp/tests.h: tmp/.created tmp/ranges.data logs-to-c	22	freqs: masses.log
22	perl logs-to-c --scoreset=$(SCORESET)	23	perl hit-frequencies -x -p -s $(SCORESET) --logfile=$(LOGFILE) > freqs
23		24
24	tmp/scores.h: tmp/tests.h
25
26	tmp/ranges.data: tmp/.created freqs score-ranges-from-freqs
27	perl score-ranges-from-freqs ../rules $(SCORESET) < freqs
28
29	freqs: spam.log ham.log
30	perl hit-frequencies -x -p -s $(SCORESET) > freqs
31
32	badrules: freqs	25	badrules: freqs
33	perl lint-rules-from-freqs < freqs > badrules	26	perl lint-rules-from-freqs -s $(SCORESET) --logfile=$(LOGFILE) > badrules
34		27
35	tmp/.created:	28	tmp/.created:
36	-mkdir tmp	29	-mkdir tmp

Lines 16-139 Link Here

(-)lint-rules-from-freqs (-244 / +167 lines)
16	# limitations under the License.	16	# limitations under the License.
17	# </@LICENSE>	17	# </@LICENSE>
18		18
		19	=head1 NAME
		20
		21	lint-rules-from-freqs - Try to find problems with SpamAssassin rules
		22
		23	=head1 SYNOPSIS
		24
		25	lint-rules-from-freqs [options]
		26
		27	Options:
		28	-c,--cffile=path Use path as the rules directory
		29	-s,--scoreset=n Use scoreset n
		30	-l,--logfile=file Read in file instead of masses.log
		31	-f Also take into account false positives/negatives
		32
		33	=head1 DESCRIPTION
		34
		35	This script analyzes SpamAssassin tests, based on the hit frequencies
		36	and S/O ratios from a mass-check log (masses.log). This script can
		37	also optionally take into account the false positive/negative
		38	frequencies.
		39
		40	The script first uses the SpamAssassin rules parser to report on any
		41	illegal syntax. Then it checks the rules match frequencies from the
		42	mass-check log in order to determine how effective the rule is.
		43
		44	=head1 BUGS
		45
		46	Please report bugs to http://bugzilla.spamassassin.org/
		47
		48	=head1 SEE ALSO
		49
		50	L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>
		51
		52	=cut
		53
		54
		55	use FindBin;
		56	use lib "$FindBin::Bin/../lib";
		57	use Mail::SpamAssassin::Masses;
		58	use Mail::SpamAssassin;
		59	use Getopt::Long qw(:config bundling auto_help);
		60	use strict;
		61	use warnings;
		62
19	# any tests that get less than this % of matches on both spam or nonspam, are	63	# any tests that get less than this % of matches on both spam or nonspam, are
20	# reported.	64	# reported.
21	my $LOW_MATCHES_PERCENT = 0.03;	65	my $LOW_MATCHES_PERCENT = 0.03;
22	my $scoreset = 0;
23		66
		67	use vars qw($opt_c $opt_l $opt_s $opt_f $opt_p);
		68
		69	GetOptions("c\|cffile=s@" => \$opt_c,
		70	"s\|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
		71	"l\|logfile=s" => \$opt_l,
		72	"f\|falses" => \$opt_f);
		73
		74
		75	$opt_s = 0 unless defined $opt_s;
		76	$opt_l \|\|= "masses.log";
		77
24	sub usage {	78	sub usage {
25	die "	79	die "
26	lint-rules-from-freqs: perform 'lint' testing on SpamAssassin rules and scores	80	lint-rules-from-freqs: perform 'lint' testing on SpamAssassin rules and scores
27		81
28	usage: ./lint-rules-from-freqs [-f falsefreqs] < freqs > badtests	82	usage: ./lint-rules-from-freqs [-c rules dir] [-l logfile] [-s scoreset] [-f]
29		83
30	This analyzes SpamAssassin tests, based on the hit frequencies and S/O ratios	84	-c p Use p as rules directory
31	from a mass-check logfile pair.	85	-l f Use f as log fine
		86	-s n Use n as score set
		87	-f Check also for false positives/negatives
32		88
33	The 'freqs' argument is the frequency of hits in all messages ('hit-frequencies
34	-x -p' output).
35		89
36	The 'falsefreqs' argument is frequencies of hits in false-positives and
37	false-negatives only ('hit-frequencies -x -p -f' output).
38
39	";	90	";
40	}	91	}
41		92
42	my $opt_falsefreqs;	93
43	while ($#ARGV >= 0) {	94	if (!$opt_c \|\| !scalar(@$opt_c)) {
44	$_ = shift @ARGV;	95	# Try to read this in from the log, if possible
45	if (/^-f/) { $_ = shift @ARGV; $opt_falsefreqs = $_; }	96	open IN, $opt_l or die "Can't open $opt_l: $!";
46	elsif (/^-s/) { $_ = shift @ARGV; $scoreset = $_; }	97	my $files = 0; # are we in the files section?
47	else { usage(); }	98	while(<IN>) {
		99	if (!$files) {
		100	if (/^\# SVN revision:/) {
		101	$opt_c = [ "$FindBin::Bin/../rules" ];
		102	last;
		103	} elsif (/^\# Using configuration:$/) {
		104	$files = 1;
		105	}
		106	} elsif (/^\#\s+(.)\s$/) {
		107	push (@$opt_c, $1);
		108	} else {
		109	# All done!
		110	last;
		111	}
		112	}
		113
		114	foreach my $file (@$opt_c) {
		115	die "Can't read $file" unless -r $file;
		116	}
48	}	117	}
49		118
50	print "BAD TESTS REPORT\n";	119	print "BAD TESTS REPORT\n";
51	readrules();	120	# First, do a --lint
52	print "\n" .((scalar keys %rulefile) + 1). " rules found.\n";	121
53	print "\nRule file syntax issues:\n\n";	122	print "\nRule file syntax issues:\n\n";
54	lintrules();
55		123
56	if ($opt_falsefreqs) {	124	{
57	open (FALSE, "<$opt_falsefreqs");	125	local (STDERR) = \STDOUT; # Get lint errors on STDOUT
58	while (<FALSE>) {	126
59	if (!/^\s*([\d\.]+)/) {	127	# Read the config ourselves...
60	my ($overall, $spam, $nons, $so, $score, $name) = split (' ');	128
61	next unless ($name =~ /\S/);	129	# Read init.pre from each directory, then glob for the rest.
62	$falsefreqs_spam{$name} = $spam;	130
63	$falsefreqs_nons{$name} = $nons;	131	my $cf_txt = '';
64	$falsefreqs_so{$name} = $so;	132	my @files;
		133	my @dirs;
		134	foreach my $file (@$opt_c) {
		135	if (-d $file) {
		136	if (-r "$file/init.pre") {
		137	push @files, "$file/init.pre";
		138	}
		139	push @dirs, $file;
65	}	140	}
		141	else {
		142	push @files, $file;
		143	}
66	}	144	}
67	close FALSE;	145	foreach my $dir (@dirs) {
68	}	146	my @cfs = glob("$dir/*.cf");
		147	push @files, grep { -r $_ } @cfs;
		148	}
69		149
70	while (<>) {	150	foreach my $file (@files) {
71	if (!/^\s*([\d\.]+)/) {	151	if (-r $file) {
72	$output{'a_header'} = $_; next;	152	open IN, $file;
		153	$cf_txt .= "file start $file\n";
		154	$cf_txt .= join('', <IN>);
		155	$cf_txt .= "\nfile end $file\n";
		156	close IN;
		157	}
73	}	158	}
74		159
		160	my $spamtest = new Mail::SpamAssassin({config_text => $cf_txt});
		161
		162	$spamtest->lint_rules();
		163	}
		164
		165
		166	# Next, check for other stuff
		167	my $masses = Mail::SpamAssassin::Masses->new({rulesdir => $opt_c,
		168	scoreset => $opt_s, #,,
		169	falses => $opt_f,
		170	logfile => $opt_l});
		171
		172	$masses->readlogs();
		173	$masses->do_statistics();
		174
		175	my $rules = $masses->get_rules_array();
		176
		177
		178	my %output;
		179
		180	foreach my $rule (@$rules) {
		181
75	my $badrule;	182	my $badrule;
76	my ($overall, $spam, $nons, $so, $score, $name) = split (' ');
77	next unless ($name =~ /\S/);
78		183
79	my $ffspam = $falsefreqs_spam{$name};	184	next if ($rule->{tflags} =~ /\bnet\b/ && ($opt_s % 2) == 0);
80	my $ffnons = $falsefreqs_nons{$name};	185	next if ($rule->{tflags} =~ /\buserconf\b/);
81	my $ffso = $falsefreqs_so{$name};
82		186
83	my $tf = $tflags{$name};	187	if ($rule->{freq_spam} == 0 && $rule->{freq_ham} == 0) { # sanity!
84	next if ($tf =~ /net/ && ($scoreset % 2) == 0);
85	next if ($tf =~ /userconf/);
86		188
87	if ($overall == 0.0 && $spam == 0.0 && $nons == 0.0) { # sanity!
88	$badrule = 'no matches';	189	$badrule = 'no matches';
89		190
90	} else {	191	} else {
91	if ($score < 0.0) {	192	if ($rule->{score} < 0.0) {
92	# negative score with more spams than nonspams? bad rule.	193	# negative score with more spams than nonspams? bad rule.
93	if ($tf !~ /nice/ && $so > 0.5 && $score < 0.5) {	194	if (!$rule->{isnice} && $rule->{soratio} > 0.5 && $rule->{score} < 0.5) {
94	$badrule = 'non-nice but -ve score';	195	$badrule = 'non-nice but -ve score';
95	}	196	}
96		197	if ($rule->{isnice} && $rule->{soratio} > 0.5 && $rule->{score} < 0.5) {
97	if ($tf =~ /nice/ && $so > 0.5 && $score < 0.5) {	198	if ($opt_f && $rule->{freq_fn} < $rule->{freq_fp}) {
98	if ($ffso < 0.5) {
99	$badrule = 'fn';	199	$badrule = 'fn';
100	} else {
101	# ignore, the FNs are overridden by other tests so it doesn't
102	# affect the overall results.
103	}	200	}
		201	# else {
		202	# ignore, the FNs are overridden by other tests so it doesn't
		203	# affect the overall results.
		204	# }
104	}	205	}
105		206
106	# low number of matches overall	207	# low number of matches overall
107	if ($nons < $LOW_MATCHES_PERCENT)	208	if ($rule->{ham_percent} < $LOW_MATCHES_PERCENT)
108	{ $badrule \|\|= ''; $badrule .= ', low matches'; }	209	{ $badrule \|\|= ''; $badrule .= ', low matches'; }
109		210
110	} elsif ($score > 0.0) {	211	} elsif ($rule->{score} > 0.0) {
111	# positive score with more nonspams than spams? bad.	212	# positive score with more nonspams than spams? bad.
112	if ($tf =~ /nice/ && $so < 0.5 && $score > 0.5) {	213	if ($rule->{isnice} && $rule->{soratio} < 0.5 && $rule->{score} > 0.5) {
113	$badrule = 'nice but +ve score';	214	$badrule = 'nice but +ve score';
114	}	215	}
115		216
116	if ($tf !~ /nice/ && $so < 0.5 && $score > 0.5) {	217	if (!$rule->{isnice} && $rule->{soratio} < 0.5 && $rule->{score} > 0.5) {
117	if ($ffso > 0.5) {	218	if ($opt_f && $rule->{freq_fp} > $rule->{freq_fn}) {
118	$badrule = 'fp';	219	$badrule = 'fp';
119	} else {
120	# ignore, the FPs are overridden by other tests so it doesn't
121	# affect the overall results.
122	}	220	}
		221	# else {
		222	# ignore, the FPs are overridden by other tests so it doesn't
		223	# affect the overall results.
		224	# }
123	}	225	}
124		226
125	# low number of matches overall	227	# low number of matches overall
126	if ($spam < $LOW_MATCHES_PERCENT)	228	if ($rule->{spam_percent} < $LOW_MATCHES_PERCENT)
127	{ $badrule \|\|= ''; $badrule .= ', low matches'; }	229	{ $badrule \|\|= ''; $badrule .= ', low matches'; }
128		230
129	} elsif ($score == 0.0) {	231	} elsif ($rule->{score} == 0.0) {
130	$badrule = 'score is 0';	232	$badrule = 'score is 0';
131	}	233	}
132	}	234	}
133		235
134	if (defined $badrule) {	236	if (defined $badrule) {
135	$badrule =~ s/^, //; chomp;	237	$badrule =~ s/^, //;
136	$output{$badrule} .= $_ . " ($badrule)\n";	238	$output{$badrule} .= $rule->{name} . " ($badrule)\n";
137	}	239	}
138	}	240	}
139		241
Lines 156-337 Link Here
156	exit;	258	exit;
157		259
158		260
159	sub concat_rule_lang {
160	my $rule = shift;
161	my $lang = shift;
162
163	if (defined $lang && $lang ne '') {
164	return "[$lang]_$rule";
165	} else {
166	return $rule;
167	}
168	}
169
170	# note: do not use parse-rules-for-masses here, we need to do linting instead
171	# of your average parse
172	sub readrules {
173	my @files = <../rules/[0-9]*.cf>;
174	my $file;
175	%rulesfound = ();
176	%langs = ();
177	foreach $file (@files) {
178	open (IN, "<$file");
179	while (<IN>) {
180	s/#.*$//g; s/^\s+//; s/\s+$//; next if /^$/;
181
182	# make all the foo-bar stuff foo_bar
183	1 while s/^(\S+)-/\1_/g;
184	1 while s/^(lang\s+\S+\s+\S+)-/\1_/g;
185
186	my $lang = '';
187	if (s/^lang\s+(\S+)\s+//) {
188	$lang = $1; $langs{$1} = 1;
189	}
190
191	if (/^(header\|rawbody\|body\|full\|uri\|meta)\s+(\S+)\s+/) {
192	$rulesfound{$2} = 1;
193	$rulefile{$2} \|\|= $file;
194	$scorefile{$1} = $file;
195	$score{$2} \|\|= 1.0;
196	$tflags{$2} \|\|= '';
197	$descfile{$2} \|\|= $file; # a rule with no score or desc is OK
198	$description{$2}->{$lang} = undef;
199
200	if (/^body\s+\S+\s+eval:/) {
201	# ignored
202	} elsif (/^body\s+\S+\s+(.*)$/) {
203	my $re = $1;
204
205	# If there's a ( in a rule where it should be (?:, flag it.
206	# but ignore [abc(] ...
207	if ($re =~ /[^\\]\([^\?]/ && $re !~ /\[[^\]]*[^\\]\(/) {
208	print "warning: non-(?:...) capture in regexp in $file: $_\n";
209	}
210	if ($re =~ /\.[\*\+]/) {
211	print "warning: .* in regexp in $file: $_\n";
212	}
213	if ($re =~ /[^\\]\{(\d),?(\d?)\}/) {
214	if ($1 > 120 \|\| $2 > 120) {
215	print "warning: long .{n} in regexp in $file: $_\n";
216	}
217	}
218	}
219
220	} elsif (/^describe\s+(\S+)\s+(.?)\s$/) {
221	$rulesfound{$1} = 1;
222	$descfile{concat_rule_lang ($1, $lang)} \|\|= $file;
223	$descfile{$1} \|\|= $file;
224	$description{$1}->{$lang} = $2;
225	} elsif (/^tflags\s+(\S+)\s+(.+)$/) {
226	$rulesfound{$1} = 1;
227	$tflags{$1} = $2;
228	$tflagsfile{concat_rule_lang ($1, $lang)} = $file;
229	$tflagsfile{$1} = $file;
230	} elsif (/^score\s+(\S+)\s+(.+)$/) {
231	$rulesfound{$1} = 1;
232	$scorefile{concat_rule_lang ($1, $lang)} = $file;
233	$scorefile{$1} = $file;
234	$score{$1} = $2;
235	} elsif (/^(clear_report_template\|clear_spamtrap_template\|report\|spamtrap\|
236	clear_terse_report_template\|terse_report\|
237	required_score\|ok_locales\|ok_languages\|test\|lang\|
238	spamphrase\|whitelist_from\|require_version\|
239	clear_unsafe_report_template\|unsafe_report\|
240	(?:bayes_)?auto_learn_threshold_nonspam\|(?:bayes_)?auto_learn_threshold_spam\|
241	(?:bayes_)?auto_learn
242	)/x) {
243	next;
244	} else {
245	print "warning: unknown rule in $file: $_\n";
246	}
247	}
248	close IN;
249	}
250	@langsfound = sort keys %langs;
251	@rulesfound = sort keys %rulesfound;
252	}
253
254	sub lintrules {
255	my %possible_renames = ();
256
257	foreach my $rule (@rulesfound) {
258	my $match = $rule;
259	$match =~ s/_\d+[^_]+$//gs; # trim e.g. "_20K"
260	$match =~ s/[^A-Z]+//gs; # trim numbers etc.
261
262	if (defined ($rulefile{$rule}) && $possible_renames{$match} !~ / \Q$rule\E\b/) {
263	$possible_renames{$match} .= " ".$rule;
264	}
265	$possible_rename_matches{$rule} = $match;
266	}
267
268	foreach my $lang ('', @langsfound) {
269	foreach my $baserule (@rulesfound) {
270	next if ( $baserule =~ /^__/ \|\| $baserule =~ /^T_/ );
271
272	my $rule = concat_rule_lang ($baserule, $lang);
273	my $f = $descfile{$rule};
274	my $warned = '';
275
276	if (defined $f && !defined ($rulefile{$rule})
277	&& !defined ($rulefile{$baserule}))
278	{
279	print "warning: $baserule has description, but no rule: $f\n";
280	$warned .= ' lamedesc';
281	}
282
283	# Check our convention for rule length
284	if ( (($lang ne '' && defined($rulefile{$rule})) \|\| ($lang eq '' && defined ($rulefile{$baserule}))) && length $baserule > 22 ) {
285	print "warning: $baserule has a name longer than 22 chars: $f\n";
286	}
287	# Check our convention for rule length
288	if ( (($lang ne '' && defined($rulefile{$rule})) \|\| ($lang eq '' && defined ($rulefile{$baserule}))) && defined $description{$baserule}->{$lang} && length $description{$baserule}->{$lang} > 50 ) {
289	print "warning: $baserule has a description longer than 50 chars: $f\n";
290	}
291
292	# lang rule trumps normal rule
293	$f = $rulefile{$rule} \|\| $rulefile{$baserule};
294	# if the rule exists, and the language/rule description doesn't exist ...
295	if ( defined $f && !defined $description{$baserule}->{$lang} )
296	{
297	print "warning: $baserule exists, ",( $lang ne '' ? "lang $lang, " : "" ),"but has no description: $f\n";
298	$warned .= ' lamedesc';
299	}
300
301
302	$f = $scorefile{$rule};
303	if (defined $f && !defined ($rulefile{$rule})
304	&& !defined ($rulefile{$baserule}))
305	{
306	print "warning: $baserule has score, but no rule: $f\n";
307	$warned .= ' lamescore';
308	}
309
310	my $r = $possible_rename_matches{$rule};
311	if ($warned ne '' && defined $r) {
312	my @matches = split (' ', $possible_renames{$r});
313	if (scalar @matches != 0) {
314	my $text = '';
315
316	# now try and figure out "nearby" rules with no description/score
317	foreach my $baser (@matches) {
318	my $blang;
319	if ($descfile{$rule} =~ /text_(\S\S)\./) {
320	$blang = $1;
321	}
322	my $r = concat_rule_lang ($baser, $blang);
323	#warn "$r $descfile{$r} $descfile{$baser}";
324	next if ($warned =~ /lamedesc/ && (defined $descfile{$r}));
325	next if ($warned =~ /lamescore/ && (defined $scorefile{$r}));
326	$text .= " $baser";
327	}
328
329	if ($text ne '') {
330	print "warning: (possible renamed rule? $text)\n";
331	}
332	}
333	}
334	}
335	}
336	}
337

Lines 16-112 Link Here

(-)mass-check (-108 / +226 lines)
16	# limitations under the License.	16	# limitations under the License.
17	# </@LICENSE>	17	# </@LICENSE>
18		18
19	sub usage {	19	=head1 NAME
20	die <<ENDOFUSAGE;
21	usage: mass-check [options] target ...
22
23	-c=file set configuration/rules directory
24	-p=dir set user-prefs directory
25	-f=file read list of targets from <file>
26	-j=jobs specify the number of processes to run simultaneously
27	--net turn on network checks!
28	--mid report Message-ID from each message
29	--debug report debugging information
30	--progress show progress updates during check
31	--rewrite=OUT save rewritten message to OUT (default is /tmp/out)
32	--showdots print a dot for each scanned message
33	--rules=RE Only test rules matching the given regexp RE
34	--restart=N restart all of the children after processing N messages
35	--deencap=RE Extract SpamAssassin-encapsulated spam mails only if they
36	were encapsulated by servers matching the regexp RE
37	(default = extract all SpamAssassin-encapsulated mails)
38
39	log options
40	-o write all logs to stdout
41	--loghits log the text hit for patterns (useful for debugging)
42	--loguris log the URIs found
43	--hamlog=log use <log> as ham log ('ham.log' is default)
44	--spamlog=log use <log> as spam log ('spam.log' is default)
45
46	message selection options
47	-n no date sorting or spam/ham interleaving
48	--after=N only test mails received after time_t N (negative values
49	are an offset from current time, e.g. -86400 = last day)
50	or after date as parsed by Time::ParseDate (e.g. '-6 months')
51	--before=N same as --after, except received times are before time_t N
52	--all don't skip big messages
53	--head=N only check first N ham and N spam (N messages if -n used)
54	--tail=N only check last N ham and N spam (N messages if -n used)
55
56	simple target options (implies -o and no ham/spam classification)
57	--dir subsequent targets are directories
58	--file subsequent targets are files in RFC 822 format
59	--mbox subsequent targets are mbox files
60	--mbx subsequent targets are mbx files
61
62	Just left over functions we should remove at some point:
63	--bayes report score from Bayesian classifier
64
65	non-option arguments are used as target names (mail files and folders),
66	the target format is: <class>:<format>:<location>
67	<class> is "spam" or "ham"
68	<format> is "dir", "file", "mbx", or "mbox"
69	<location> is a file or directory name. globbing of ~ and * is supported
70		20
71	ENDOFUSAGE	21	mass-check - Generates SpamAssassin scores and results for large
72	}	22	amounts of mail
73		23
		24	=head1 SYNOPSIS
		25
		26	mass-check [options] class:format:location ...
		27	mass-check [options] {--dir \| --file \| --mbox} target ...
		28	mass-check [options] -f file
		29
		30	Options:
		31	-f=file read list of targets from <file>
		32	-j=jobs specify the number of processes to run simultaneously
		33	--net turn on network checks!
		34	--mid report Message-ID from each message
		35	--debug report debugging information
		36	--progress show progress updates during check
		37	--rewrite=OUT save rewritten message to OUT (default is /tmp/out)
		38	--showdots print a dot for each scanned message
		39	--rules=RE Only test rules matching the given regexp RE
		40	--restart=N restart all of the children after processing N messages
		41
		42	SpamAssassin options
		43	-c=dir set configuration/rules directory
		44	-p=file set user preferences file (default: none)
		45	-s=dir set site rules configuration directory
		46	-u=dir set user-state directory
		47	--dist assumes the script is being run from the masses/ dir of
		48	the unpacked tarball, and makes appropriate guesses for
		49	-p and -c
		50	--deencap=RE Extract SpamAssassin-encapsulated spam mails only if they
		51	were encapsulated by servers matching the regexp RE
		52	(default = extract all SpamAssassin-encapsulated mails)
		53
		54	log options
		55	-o write all logs to stdout
		56	--loghits log the text hit for patterns (useful for debugging)
		57	--loguris log the URIs found
		58	--log=file log to <file> (masses.log is default)
		59
		60	message selection options
		61	-n no date sorting or spam/ham interleaving
		62	--after=N only test mails received after time_t N (negative values
		63	are an offset from current time, e.g. -86400 = last day)
		64	or after date as parsed by Time::ParseDate (e.g. '-6 months')
		65	--before=N same as --after, except received times are before time_t N
		66	--all don't skip big messages
		67	--head=N only check first N ham and N spam (N messages if -n used)
		68	--tail=N only check last N ham and N spam (N messages if -n used)
		69
		70	simple target options (implies -o and no ham/spam classification)
		71	--dir subsequent targets are directories
		72	--file subsequent targets are files in RFC 822 format
		73	--mbox subsequent targets are mbox files
		74	--mbx subsequent targets are mbx files
		75
		76	Just left over functions we should remove at some point:
		77	--bayes report score from Bayesian classifier
		78	--hamlog=log use <log> as ham log ('ham.log' is default)
		79	--spamlog=log use <log> as spam log ('spam.log' is default)
		80
		81	=head1 DESCRIPTION
		82
		83	B<mass-check> is designed to assist with rule development and
		84	generation of SpamAssassin scored. It reads in mail from the
		85	location(s) specified on the command line (in the first form above),
		86	given in the form I<class:format:location>, where I<class> is either
		87	"spam" or "ham" (non-spam), I<format> is one of "dir" (Maildirs, MH,
88	etc), "file", "mbox" (mboxes can be gzipped) or "mbx".
89
90	B<mass-check> will analyze each message using SpamAssassin and
91	generate one-line of output per message, (by default to masses.log) in
92	the following format:
93
94	{s\|h} {s\|h} score filename tests-hit
95
96	The first field is the message's class as given on the command line
97	(ham or spam). The second is the message's class as determined by
98	SpamAssassin. The third is the message's score, as determined by
99	SpamAssassin. The fourth field contains the message's filename; for
100	mboxes, this contains the filename and the byte offset from the
101	beginning of the file separated by a period. The last field contains a
102	list of all the tests the message hit separated by commas.
103
104	If you want to run this on the currently installed version of
105	SpamAssassin's rules for sitewide use, make sure your user_prefs file
106	contains no rules.
107
108	=head1 BUGS
109
110	Please report bugs to http://bugzilla.spamassassin.org/
111
112	=head1 SEE ALSO
113
114	L<hit-frequencies(1)>, L<logs-to-c(1)>, L<Mail::SpamAssassin::Masses(3)>,
115	L<perceptron(1)>
116
117	=cut
118
74	###########################################################################	119	###########################################################################
75		120
76	use vars qw($opt_c $opt_p $opt_f $opt_j $opt_n $opt_o $opt_all $opt_bayes	121	use vars qw($opt_c $opt_p $opt_f $opt_j $opt_n $opt_o $opt_all
77	$opt_debug $opt_format $opt_hamlog $opt_head $opt_loghits	122	$opt_bayes $opt_before $opt_debug $opt_dist $opt_format
78	$opt_mid $opt_mh $opt_ms $opt_net $opt_nosort $opt_progress	123	$opt_hamlog $opt_head $opt_log $opt_loghits $opt_mid
79	$opt_showdots $opt_spamlog $opt_tail $opt_rules $opt_restart	124	$opt_mh $opt_ms $opt_net $opt_nosort $opt_p $opt_progress
80	$opt_loguris $opt_after $opt_before $opt_rewrite $opt_deencap);	125	$opt_s $opt_showdots $opt_spamlog $opt_tail $opt_rules
		126	$opt_restart $opt_loguris $opt_after $opt_rewrite $opt_u
		127	$opt_deencap);
81		128
82	use FindBin;	129	use FindBin;
83	use lib "$FindBin::Bin/../lib";	130	use lib "$FindBin::Bin/../lib";
84	eval "use bytes";	131	eval "use bytes";
85	use Mail::SpamAssassin::ArchiveIterator;	132	use Mail::SpamAssassin::ArchiveIterator;
86	use Mail::SpamAssassin;	133	use Mail::SpamAssassin;
87	use Getopt::Long;	134	use Getopt::Long qw(:config bundling auto_help);
88	use POSIX qw(strftime);	135	use POSIX qw(strftime);
89	use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; };	136	use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; };
		137	use strict; # Why wasn't this on?
90	use Config;	138	use Config;
91		139
92	# default settings	140	# default settings
93	$opt_c = "$FindBin::Bin/../rules";	141
94	$opt_p = "$FindBin::Bin/spamassassin";
95	$opt_j = 1;	142	$opt_j = 1;
96	$opt_net = 0;	143	$opt_net = 0;
97	$opt_hamlog = "ham.log";	144	$opt_log = "masses.log";
98	$opt_spamlog = "spam.log";
99		145
100	GetOptions("c=s", "p=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug",	146	GetOptions("c=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug",
101	"hamlog=s", "head=i", "loghits", "mh", "mid", "ms", "net",	147	"deencap=s", "dist!", "hamlog=s", "head=i", "log=s",
102	"progress", "rewrite:s", "showdots", "spamlog=s", "tail=i",	148	"loghits", "mh", "mid", "ms", "net", "p=s", "progress",
103	"rules=s", "restart=i", "after=s", "before=s", "loguris", "deencap=s",	149	"rewrite:s", "s=s", "showdots", "spamlog=s", "tail=i",
		150	"rules=s", "restart=i", "u=s", "after=s", "loguris",
104	"dir" => sub { $opt_format = "dir"; },	151	"dir" => sub { $opt_format = "dir"; },
105	"file" => sub { $opt_format = "file"; },	152	"file" => sub {$opt_format = "file"; },
106	"mbox" => sub { $opt_format = "mbox"; },	153	"mbox" => sub { $opt_format = "mbox"; },
107	"mbx" => sub { $opt_format = "mbx"; },	154	"mbx" => sub { $opt_format = "mbx"; },
108	'<>' => \&target) or usage();	155	'<>' => \&target) or usage();
109		156
		157	if ($opt_hamlog \|\| $opt_spamlog) { # Old style logging
		158	$opt_hamlog \|\|= "ham.log";
		159	$opt_spamlog \|\|= "spam.log";
		160	}
		161
		162	my @targets;
		163
110	if ($opt_f) {	164	if ($opt_f) {
111	open(F, $opt_f) \|\| die $!;	165	open(F, $opt_f) \|\| die $!;
112	push(@targets, map { chomp; $_ } <F>);	166	push(@targets, map { chomp; $_ } <F>);
Lines 115-159 Link Here
115		169
116	if (scalar @targets == 0) { usage(); }	170	if (scalar @targets == 0) { usage(); }
117		171
118	#if ($opt_ms) {	172	# Auto-detect --dist option
119	#find_missed($opt_spamlog);	173	if (!defined $opt_dist) {
120	#}	174	if (-f "$FindBin::Bin/../spamassassin.raw") {
121	#elsif ($opt_mh) {	175	warn "Automatically using --dist. Assuming you are running from the unpacked tarball. Use --no-dist to override.";
122	#find_missed($opt_hamlog);	176	$opt_dist = 1;
123	#}	177	}
		178	}
124		179
125	$spamtest = new Mail::SpamAssassin ({	180	my $local_rules_dir;
126	'debug' => $opt_debug,
127	'rules_filename' => $opt_c,
128	'userprefs_filename' => "$opt_p/user_prefs",
129	'site_rules_filename' => "$opt_p/local.cf",
130	'userstate_dir' => "$opt_p",
131	'save_pattern_hits' => $opt_loghits,
132	'dont_copy_prefs' => 1,
133	'local_tests_only' => $opt_net ? 0 : 1,
134	'only_these_rules' => $opt_rules,
135	'ignore_safety_expire_timeout' => 1,
136	PREFIX => '',
137	DEF_RULES_DIR => $opt_c,
138	LOCAL_RULES_DIR => '',
139	});
140		181
		182	if ($opt_dist) { # Set defaults
		183	$opt_c \|\|= "$FindBin::Bin/../rules";
		184	$opt_p \|\|= "$FindBin::Bin/mass-check.cf";
		185	$opt_u \|\|= "$FindBin::Bin/spamassassin";
		186	$opt_s \|\|= "$FindBin::Bin/spamassassin";
		187	$local_rules_dir = '';
		188	}
		189	else {
		190	if(!$opt_u) {
		191	# Assuming this is OK, since mass-check isnt supported on windows, is it?
		192	# Also, should there be some check to make sure that previous mass-check stuff isn't in there?
		193	# AFAICT, there isn't otherwise....
		194	if ( -d "${ENV{HOME}}/.spamassassin" ) {
		195	$opt_u = "${ENV{HOME}}/.spamassassin/mass-check";
		196	mkdir $opt_u, 0700 if (! -d $opt_u);
		197	}
		198	}
		199
		200	# Leave the rest to SA, we'll get it afterwards
		201
		202	}
		203
		204	my $spamtest = new Mail::SpamAssassin ({
		205	'debug' => $opt_debug,
		206	'rules_filename' => $opt_c,
		207	'userprefs_filename' => $opt_p,
		208	'site_rules_filename' => $opt_s,
		209	'userstate_dir' => $opt_u,
		210	'save_pattern_hits' => $opt_loghits,
		211	'dont_copy_prefs' => 1,
		212	'local_tests_only' => $opt_net ? 0 : 1,
		213	'only_these_rules' => $opt_rules,
		214	'ignore_safety_expire_timeout' => 1,
		215	DEF_RULES_DIR => $opt_c,
		216	LOCAL_RULES_DIR => $local_rules_dir,
		217	});
		218
141	$spamtest->compile_now(1);	219	$spamtest->compile_now(1);
142	$spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf");	220	if ($opt_dist) {
		221	$spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf");
		222	}
143		223
144	my $who = `id -un 2>/dev/null`; chomp $who;	224	my $who = `id -un 2>/dev/null`; chomp $who;
145	my $where = `uname -n 2>/dev/null`; chomp $where;	225	my $where = `uname -n 2>/dev/null`; chomp $where;
146	my $when = `date -u`; chomp $when;	226	my $when = `date -u`; chomp $when;
147	my $revision = "unknown";	227	my $revision;
148	if (open(TESTING, "$opt_c/70_testing.cf")) {	228
149	chomp($revision = <TESTING>);	229	if ($opt_dist) {
150	$revision =~ s/.\$Rev:\s(\S+).*/$1/;	230	my $rev = "unknown";
151	close(TESTING);	231	if (open(TESTING, "$opt_c/70_testing.cf")) {
		232	chomp($rev = <TESTING>);
		233	$rev =~ s/.\$Rev:\s(\S+).*/$1/;
		234	close(TESTING);
		235	}
		236	$revision = "SVN revision: $rev";
152	}	237	}
		238	else {
		239	$revision = "Local";
		240	}
		241
153	my $log_header = "# mass-check results from $who\@$where, on $when\n" .	242	my $log_header = "# mass-check results from $who\@$where, on $when\n" .
154	"# M:SA version ".$spamtest->Version()."\n" .	243	"# M:SA version ".$spamtest->Version()."\n" .
155	"# SVN revision: $revision\n" .	244	"# $revision\n" .
156	"# Perl version: $] on $Config{archname}\n";	245	"# Perl version: $] on $Config{archname}\n";
		246
		247	if (!$opt_dist) {
		248	my @paths = ( $spamtest->{rules_filename}, $spamtest->{site_rules_filename}, $spamtest->{userprefs_filename} );
		249	$log_header .= "# Using configuration:\n";
		250	foreach my $file (@paths) {
		251	$log_header .= "# $file\n";
		252	}
		253	}
		254
157	my $host = $ENV{'HOSTNAME'} \|\| $ENV{'HOST'} \|\| `hostname` \|\| 'localhost';	255	my $host = $ENV{'HOSTNAME'} \|\| $ENV{'HOST'} \|\| `hostname` \|\| 'localhost';
158	chomp $host;	256	chomp $host;
159		257
Lines 222-228 Link Here
222	autoflush STDOUT 1;	320	autoflush STDOUT 1;
223	print STDOUT $log_header;	321	print STDOUT $log_header;
224	}	322	}
225	else {	323	elsif ($opt_hamlog \|\| $opt_spamlog) {
226	open(HAM, "> $opt_hamlog");	324	open(HAM, "> $opt_hamlog");
227	open(SPAM, "> $opt_spamlog");	325	open(SPAM, "> $opt_spamlog");
228	autoflush HAM 1;	326	autoflush HAM 1;
Lines 230-235 Link Here
230	print HAM $log_header;	328	print HAM $log_header;
231	print SPAM $log_header;	329	print SPAM $log_header;
232	}	330	}
		331	else {
		332	open(OUT, "> $opt_log");
		333	autoflush OUT 1;
		334	print OUT $log_header;
		335	}
233	$init_results = 1;	336	$init_results = 1;
234	}	337	}
235		338
Lines 239-263 Link Here
239	# don't open results files until we get here to avoid overwriting files	342	# don't open results files until we get here to avoid overwriting files
240	&init_results if !$init_results;	343	&init_results if !$init_results;
241		344
242	if ($class eq "s") {	345	if ($opt_o) {
243	if ($opt_o) { print STDOUT $result; } else { print SPAM $result; }	346	print STDOUT $result;
244	$spam_count++;
245	}	347	}
246	elsif ($class eq "h") {	348	elsif ($opt_spamlog \|\| $opt_hamlog) {
247	if ($opt_o) { print STDOUT $result; } else { print HAM $result; }	349	if ($class eq "s") {
248	$ham_count++;	350	print SPAM $result;
		351	} else {
		352	print HAM $result;
		353	}
249	}	354	}
		355	else {
		356	print OUT $result;
		357	}
250		358
251	$total_count++;	359	$total_count++;
252	#warn ">> result: $total_count $class $time\n";	360	#warn ">> result: $total_count $class $time\n";
253		361
254	if ($opt_progress) {	362	if ($opt_progress) {
		363	if ($class eq "s") {
		364	$spam_count++;
		365	}
		366	else {
		367	$ham_count++;
		368	}
255	progress($time);	369	progress($time);
256	}	370	}
257	}	371	}
258		372
259	sub wanted {	373	sub wanted {
260	my (undef, $id, $time, $dataref) = @_;	374	my ($class, $id, $time, $dataref) = @_;
261	my $out;	375	my $out;
262		376
263	my $ma = $spamtest->parse($dataref, 1);	377	my $ma = $spamtest->parse($dataref, 1);
Lines 308-325 Link Here
308	push(@extra, "mid=$mid");	422	push(@extra, "mid=$mid");
309	}	423	}
310		424
311	my $yorn;	425	my $result;
312	my $score;	426	my $score;
313	my $tests;	427	my $tests;
314	my $extra;	428	my $extra;
315		429
316	if ($opt_loguris) {	430	if ($opt_loguris) {
317	$yorn = '.';	431	$result = '.';
318	$score = 0;	432	$score = 0;
319	$tests = join(" ", sort @uris);	433	$tests = join(" ", sort @uris);
320	$extra = '';	434	$extra = '';
321	} else {	435	} else {
322	$yorn = $status->is_spam() ? 'Y' : '.';	436	if ($status->is_spam()) {
		437	$result = "s";
		438	} else {
		439	$result = "h";
		440	}
323	$score = $status->get_score();	441	$score = $status->get_score();
324	$tests = join(",", sort(grep(length,$status->get_names_of_tests_hit(),$status->get_names_of_subtests_hit())));	442	$tests = join(",", sort(grep(length,$status->get_names_of_tests_hit(),$status->get_names_of_subtests_hit())));
325	$extra = join(",", @extra);	443	$extra = join(",", @extra);
Lines 333-339 Link Here
333		451
334	$id =~ s/\s/_/g;	452	$id =~ s/\s/_/g;
335		453
336	$out .= sprintf("%s %2d %s %s %s\n", $yorn, $score, $id, $tests, $extra);	454	$out .= sprintf("%s %s %05.2f %s %s %s\n", $class, $result, $score, $id, $tests, $extra);
337		455
338	if ($tests =~ /MICROSOFT_EXECUTABLE\|MIME_SUSPECT_NAME/) {	456	if ($tests =~ /MICROSOFT_EXECUTABLE\|MIME_SUSPECT_NAME/) {
339	$out .= logkilled($ma, $id, "possible virus");	457	$out .= logkilled($ma, $id, "possible virus");

Lines 52-62 Link Here

(-)README (-10 / +10 lines)
52	This script is used to perform "mass checks" of a set of mailboxes, Cyrus	52	This script is used to perform "mass checks" of a set of mailboxes, Cyrus
53	folders, and/or MH mail spools. It generates summary lines like this:	53	folders, and/or MH mail spools. It generates summary lines like this:
54		54
55	Y 7 /home/jm/Mail/Sapm/1382 SUBJ_ALL_CAPS,SUPERLONG_LINE,SUBJ_FULL_OF_8BITS	55	s s 07.22 /home/jm/Mail/Sapm/1382 SUBJ_ALL_CAPS,SUPERLONG_LINE,SUBJ_FULL_OF_8BITS
56		56
57	or for mailboxes,	57	or for mailboxes,
58		58
59	. 1 /path/to/mbox:<5.1.0.14.2.20011004073932.05f4fd28@localhost> TRACKER_ID,BALANCE_FOR_LONG	59	h h 01.32 /path/to/mbox:<5.1.0.14.2.20011004073932.05f4fd28@localhost> TRACKER_ID,BALANCE_FOR_LONG
60		60
61	listing the path to the message or its message ID, its score, and the tests	61	listing the path to the message or its message ID, its score, and the tests
62	that triggered on that mail.	62	that triggered on that mail.
Lines 65-87 Link Here
65	get good hits with few false positives, etc., and re-score the tests to	65	get good hits with few false positives, etc., and re-score the tests to
66	optimise the ratio.	66	optimise the ratio.
67		67
68	This script relies on the spamassassin distribution directory living in "..".	68	If given the --dist option, this script relies on the spamassassin
		69	distribution directory living in "..". If this script is not in the
		70	distribution directory, it will generate logs based on the site-wide
		71	rules, as well as personal rules.
69		72
70		73
71	logs-to-c :	74	logs-to-c :
72		75
73	Takes the "spam.log" and "nonspam.log" files and converts them into C	76	Takes the "masses.log" file and converts them into C source files
74	source files and simplified data files for use by the C score optimization	77	and simplified data files for use by the C score optimization
75	algorithm. (Called by "make" when you build the perceptron, so generally	78	algorithm. (Called by "make" when you build the perceptron, so
76	you won't need to run it yourself.)	79	generally you won't need to run it yourself.)
77		80
78
79	hit-frequencies :	81	hit-frequencies :
80		82
81	Analyses the log files and computes how often each test hits, overall,	83	Analyses the log files and computes how often each test hits, overall,
82	for spam mails and for non-spam.	84	for spam mails and for non-spam.
83		85
84
85	mk-baseline-results :	86	mk-baseline-results :
86		87
87	Compute results for the baseline scores (read from ../rules/*). If you	88	Compute results for the baseline scores (read from ../rules/*). If you
Lines 91-97 Link Here
91	It will output statistics on the current ruleset to ../rules/STATISTICS.txt,	92	It will output statistics on the current ruleset to ../rules/STATISTICS.txt,
92	suitable for a release build of SpamAssassin.	93	suitable for a release build of SpamAssassin.
93		94
94
95	perceptron.c :	95	perceptron.c :
96		96
97	Perceptron learner by Henry Stern. See "README.perceptron" for details.	97	Perceptron learner by Henry Stern. See "README.perceptron" for details.

Lines 1-3 Link Here

(-)fp-fn-statistics (-2 / +145 lines)
1	#!/bin/sh	1	#!/usr/bin/perl -w
		2	#
		3	# <@LICENSE>
		4	# Copyright 2004 Apache Software Foundation
		5	#
		6	# Licensed under the Apache License, Version 2.0 (the "License");
		7	# you may not use this file except in compliance with the License.
		8	# You may obtain a copy of the License at
		9	#
		10	# http://www.apache.org/licenses/LICENSE-2.0
		11	#
		12	# Unless required by applicable law or agreed to in writing, software
		13	# distributed under the License is distributed on an "AS IS" BASIS,
		14	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		15	# See the License for the specific language governing permissions and
		16	# limitations under the License.
		17	# </@LICENSE>
		18
		19
		20	use FindBin;
		21	use lib "$FindBin::Bin/../lib";
		22	use Mail::SpamAssassin::Masses;
		23	use Getopt::Long;
		24	use strict;
		25	use warnings;
2		26
3	exec ./logs-to-c --count $*	27	use vars qw{$opt_c $opt_l $opt_s $opt_h $opt_t $opt_lambda};
		28
		29	GetOptions("c\|cffile=s@" => \$opt_c, "l\|logfile=s" => \$opt_l,
		30	"s\|scoreset=i" => \$opt_s, "h\|help" => \$opt_h, "t\|threshold" =>
		31	\$opt_t, "lambda" => \$opt_lambda);
		32
		33
		34	# Need to add rule for scores file
		35	sub usage {
		36	die "logs-to-c [-c rules dir] [-l log file] [-s SS] [-t]
		37
		38	-c,--cffile p use p as rules directory
		39	-s,--scoreset SS use scoreset SS
		40	-l,--log log read logs from log instead of from masses.log
		41	-t,--threshold n use a threshold of n (default: 5)
		42	--lambda=n use a lambda value of n (default: 5)
		43	";
		44	}
		45
		46
		47	usage() if $opt_h;
		48
		49	$opt_c \|\|= "$FindBin::Bin/../rules";
		50	$opt_t = (defined($opt_t) ? $opt_t : 5);
		51	$opt_s \|\|= 0; #\|
		52	$opt_l \|\|= "masses.log";
		53	$opt_lambda \|\|= 5;
		54
		55	print "$opt_c used for config";
		56
		57	my $nybias = 10;
		58
		59
		60	my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
		61	scoreset => $opt_s, # ,,
		62	logfile => $opt_l});
		63
		64	$masses->readlogs();
		65
		66	my $logs = $masses->get_logs();
		67
		68	my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore) = (0,0,0,0,0,0,0,0);
		69
		70	my $num_spam = $masses->get_num_spam();
		71	my $num_ham = $masses->get_num_ham();
		72	my $num_logs = $num_spam + $num_ham;
		73
		74	my $count = 0;
		75
		76	my $score;
		77
		78	foreach my $log (@$logs) {
		79
		80	$score = 0;
		81	foreach my $test (@{$log->{tests_hit}}) {
		82
		83	next if ($test->{issubrule});
		84	next if (!$test->{score});
		85
		86	$score += $test->{score};
		87
		88	}
		89
		90	if ($score >= $opt_t) {
		91	if ($log->{isspam}) {
92	$ga_yy++;
93	$yyscore += $score;
94	}
95	else {
96	$ga_ny++;
97	$nyscore += $score;
98	}
99	} else {
100	if ($log->{isspam}) {
101	$ga_yn++;
102	$ynscore += $score;
103	}
104	else {
105	$ga_nn++;
106	$nnscore += $score;
107	}
108	}
109	}
110
111	$nybias = $nybias * ($num_spam / $num_ham);
112
113	my $fprate = ($ga_ny / $num_logs) * 100.0;
114	my $fnrate = ($ga_yn / $num_logs) * 100.0;
115
116	printf ("\n# SUMMARY for threshold %3.1f:\n", $opt_t);
117	printf "# Correctly non-spam: %6d %4.2f%% (%4.2f%% of non-spam corpus)\n", $ga_nn,
118	($ga_nn / $num_logs) * 100.0, ($ga_nn / $num_ham) * 100.0;
119	printf "# Correctly spam: %6d %4.2f%% (%4.2f%% of spam corpus)\n" , $ga_yy,
120	($ga_yy / $num_logs) * 100.0, ($ga_yy / $num_spam) * 100.0;
121	printf "# False positives: %6d %4.2f%% (%4.2f%% of nonspam, %6.0f weighted)\n", $ga_ny,
122	$fprate, ($ga_ny / $num_ham) * 100.0, $nyscore*$nybias;
123	printf "# False negatives: %6d %4.2f%% (%4.2f%% of spam, %6.0f weighted)\n", $ga_yn,
124	$fnrate, ($ga_yn / $num_spam) * 100.0, $ynscore;
125
126	# convert to the TCR metrics used in the published lit
127	my $nspamspam = $ga_yy;
128	my $nspamlegit = $ga_yn;
129	my $nlegitspam = $ga_ny;
130	my $nlegitlegit = $ga_yn;
131	my $nlegit = $num_ham;
132	my $nspam = $num_spam;
133
134	my $werr = ($opt_lambda * $nlegitspam + $nspamlegit)
135	/ ($opt_lambda * $nlegit + $nspam);
136
137	my $werr_base = $nspam
138	/ ($opt_lambda * $nlegit + $nspam);
139
140	$werr \|\|= 0.000001; # avoid / by 0
141	my $tcr = $werr_base / $werr;
142
143	my $sr = ($nspamspam / $nspam) * 100.0;
144	my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;
145	printf "# TCR: %3.6f SpamRecall: %3.3f%% SpamPrec: %3.3f%% FP: %3.2f%% FN: %3.2f%%\n", $tcr, $sr, $sp, $fprate, $fnrate;
146

Lines 16-272 Link Here

(-)logs-to-c (-346 / +212 lines)
16	# limitations under the License.	16	# limitations under the License.
17	# </@LICENSE>	17	# </@LICENSE>
18		18
19	use Getopt::Long;	19	=head1 NAME
20	use vars qw($opt_cffile $opt_count $opt_lambda $opt_threshold
21	$opt_spam $opt_nonspam);
22		20
23	GetOptions("cffile=s", "count", "lambda=f", "threshold=f", "spam=s", "nonspam=s", "scoreset=i");	21	logs-to-c - Convert a mass-check log into perceptron format
24	my $argcffile = $opt_cffile;
25		22
26	my $justcount = 0;	23	=head1 SYNOPSIS
27	if ($opt_count) { $justcount = 1; }
28		24
29	my $threshold = 5;	25	logs-to-c [options]
30	if (defined $opt_threshold) { $threshold = $opt_threshold; }
31		26
32	$opt_spam \|\|= 'spam.log';	27	Options:
33	$opt_nonspam \|\|= 'ham.log';	28	-c,--cffile=path Use path as the rules directory
34	$opt_scoreset = 0 if ( !defined $opt_scoreset );	29	-s,--scoreset=n Use scoreset n
		30	-l,--logfile=file Read in file instead of masses.log
		31	-o,--outputdir Put output in the specified dir (default tmp/)
35		32
36	my $nybias = 10;	33	=head1 DESCRIPTION
37		34
38	# lambda value for TCR equation, indicating the "cost" of recovering	35	B<logs-to-c> will read the mass-check log F<masses.log> or as
39	# from an FP. The values are: 1 = tagged only, 9 = mailed back to	36	specified by the B<--logfile> option, and convert it into the format
40	# sender asking for token (TMDA style), 999 = deleted outright.	37	needed by the perceptron. This is a format that is simple for the
41	# We (SpamAssassin) use a default of 5, representing "moved to	38	perceptron to parse, but is not very readable to humans.
42	# infrequently-read folder".
43		39
44	my $lambda = 5;	40	By default, output will be put in the directory ./tmp/ unless another
45	if ($opt_lambda) { $lambda = $opt_lambda; }	41	directory is specified by the B<--outputdir> option. (Note: at the
		42	current time, this must be /tmp/ in order for the perceptron to
		43	compile properly.)
46		44
47	my %is_spam = ();	45	=head1 BUGS
48	my %tests_hit = ();
49	my %mutable_tests = ();
50		46
51	use vars qw(%rules %allrules);	47	Please report bugs to http://bugzilla.spamassassin.org/
52		48
53	readscores();	49	=head1 SEE ALSO
54		50
55	print "Reading per-message hit stat logs and scores...\n";	51	L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>
56	my ($num_tests, $num_spam, $num_nonspam);
57	my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore);
58		52
59	readlogs();	53	=cut
60	read_ranges();
61		54
62	if ($justcount) {	55	use FindBin;
63	$nybias = $nybias*($num_spam / $num_nonspam);	56	use lib "$FindBin::Bin/../lib";
64	evaluate();	57	use Mail::SpamAssassin::Masses;
65	} else {	58	use Getopt::Long qw(:config bundling auto_help);
66	print "Writing logs and current scores as C code...\n";	59	use strict;
67	writescores_c();	60	use warnings;
		61
		62	use vars qw{$opt_c $opt_l $opt_s $opt_o};
		63
		64	GetOptions("c\|cffile=s@" => \$opt_c,
		65	"s\|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
		66	"l\|logfile=s" => \$opt_l,
		67	"o\|output=s" => \$opt_o);
		68
		69
		70	$opt_o \|\|= "./tmp/";
		71	if (!-d $opt_o) {
		72	mkdir $opt_o, 0777 or die "Can't mkdir $opt_o";
68	}	73	}
69	exit 0;
70		74
		75	if (!$opt_c \|\| !scalar(@$opt_c)) {
		76	# Try to read this in from the log, if possible
		77	open IN, $opt_l or die "Can't open $opt_l: $!";
		78	my $files = 0; # are we in the files section?
		79	while(<IN>) {
		80	if (!$files) {
		81	if (/^\# SVN revision:/) {
		82	$opt_c = [ "$FindBin::Bin/../rules" ];
		83	last;
		84	} elsif (/^\# Using configuration:$/) {
		85	$files = 1;
		86	}
		87	} elsif (/^\#\s+(.)\s$/) {
		88	push (@$opt_c, $1);
		89	} else {
		90	# All done!
		91	last;
		92	}
		93	}
71		94
72	sub readlogs {	95	foreach my $file (@$opt_c) {
73	my $count = 0;	96	die "Can't read $file" unless -r $file;
74	$num_spam = $num_nonspam = 0;	97	}
		98	}
75		99
76	if ($justcount) {	100	# ignore rules that are subrules -- we don't generate scores for them...
77	$ga_yy = $ga_ny = $ga_yn = $ga_nn = 0;
78	$yyscore = $ynscore = $nyscore = $nnscore = 0.0;
79	}
80		101
81	foreach my $file ($opt_spam, $opt_nonspam) {	102	# Note: this will cause a difference over the old logs-to-c since rank
82	open (IN, "<$file");	103	# is dependent on the frequencies of all rules, not just non-subrules
83		104
84	while (<IN>) {	105	my $greprules = sub { return 0 if $_[1]->{issubrule}; return 1; };
85	next if /^\#/;
86	next if /^$/;
87	if($_ !~ /^.\s+([-\d]+)\s+\S+\s*/) { warn "bad line: $_"; next; }
88	my $hits = $1;
89	#my $foo = $_;
90	$_ = $'; s/(?:bayes\|time)=\S+//; s/,,+/,/g; s/^\s+//; s/\s+$//;
91		106
92	my $score = 0;	107	$opt_s \|\|= 0; # \|
93	my @tests = ();
94	foreach my $tst (split (/,/, $_)) {
95	next if ($tst eq '');
96	if (!defined $scores{$tst}) {
97	#warn "unknown test in $file, ignored: $tst\n";
98	next;
99	}
100		108
101	# Make sure to skip any subrules!	109	my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
102	next if ( $allrules{$tst}->{issubrule} );	110	scoreset => $opt_s, # ,,
		111	logfile => $opt_l,
		112	greprules => $greprules });
103		113
104	if ($justcount) {	114	$masses->readlogs();
105	$score += $scores{$tst};	115	$masses->do_score_ranges();
106	} else {
107	push (@tests, $tst);
108	}
109	}
110		116
111	if (!$justcount) {	117	my $rules = $masses->get_rules_array();
112	$tests_hit{$count} = \@tests;	118	my $logs = $masses->get_logs();
113	}
114		119
115	if ($file eq $opt_spam) {	120	my @index_to_rule;
116	$num_spam++;	121	my $num_spam = $masses->get_num_spam();
117	if ($justcount) {	122	my $num_ham = $masses->get_num_ham();
118	if ($score >= $threshold) {
119	$ga_yy++; $yyscore += $score;
120	} else {
121	$ga_yn++; $ynscore += $score;
122	}
123	} else {
124	$is_spam{$count} = 1;
125	}
126	} else {
127	$num_nonspam++;
128	if ($justcount) {
129	if ($score >= $threshold) {
130	#print "$score -- $foo";
131	$ga_ny++; $nyscore += $score;
132	} else {
133	$ga_nn++; $nnscore += $score;
134	}
135	} else {
136	$is_spam{$count} = 0;
137	}
138	}
139	$count++;
140	}
141	close IN;
142	}
143	$num_tests = $count;
144	}
145		123
		124	# This is misleading -- num_tests is really num_msgs
		125	my $num_tests = $num_spam + $num_ham;
146		126
147	sub readscores {
148	if (!defined $argcffile) { $argcffile = "../rules"; }
149	print "Reading scores from \"$argcffile\"...\n";
150	system ("./parse-rules-for-masses -d \"$argcffile\" -s $opt_scoreset") and die;
151	require "./tmp/rules.pl";
152	%allrules = %rules; # ensure it stays global
153	}
154		127
		128	# Write logs and scores as C code
		129	writescores_c();
		130	writetests_c();
		131
		132
155	sub writescores_c {	133	sub writescores_c {
156	my $output = '';	134
157	my $size = 0;
158	my $mutable = 0;	135	my $mutable = 0;
159	my $i;	136	my $output = '';
		137	my $count = 0;
		138	my $score = 0;
160		139
161	# jm: now, score-ranges-from-freqs has tflags to work from, so	140	foreach my $rule (sort {($b->{ismutable} <=> $a->{ismutable}) \|\|
162	# it will always list all mutable tests.	141	($a->{name} cmp $b->{name}) } @$rules) {
163		142
164	@index_to_rule = sort {($ignored_rule{$a} <=> $ignored_rule{$b}) \|\|	143	$score = $rule->{score};
165	($mutable_tests{$b} <=> $mutable_tests{$a}) \|\|
166	($a cmp $b)} (keys %scores);
167	my $max_hits_per_msg = 0;
168	for ($file = 0; $file < $num_tests; $file++) {
169	my(@hits) =
170	grep {(! $ignored_rule{$_}) && $mutable_tests{$_}} (@{$tests_hit{$file}});
171	if ((scalar(@hits)+1) > $max_hits_per_msg) {
172	$max_hits_per_msg = scalar(@hits)+1;
173	}
174	}
175		144
176	for ($i = 0; $i <= $#index_to_rule; $i++) {	145	# ignored rules (i.e. no scores)
177	my $name = $index_to_rule[$i];	146	next unless $score;
178	$rule_to_index{$name} = $i;
179		147
180	if ($ignored_rule{$name}) { next; }	148	# also ignore rules with score range 0
		149	next if (!$rule->{range_lo} && !$rule->{range_hi});
181		150
182	if ($mutable_tests{$name} == 0) {	151	# Set an index
183	$range_lo{$name} = $range_hi{$name} = $scores{$name};	152	$rule->{index} = $count;
184	} else {	153	$index_to_rule[$count] = $rule; # add the reference to the array
		154
		155	if ($rule->{ismutable}) {
185	$mutable++;	156	$mutable++;
186	if ($range_lo{$name} > $range_hi{$name}) {	157	if ($score > $rule->{range_hi}) {
187	($range_lo{$name},$range_hi{$name}) =	158	$score = $rule->{range_hi} - 0.001;
188	($range_hi{$name},$range_lo{$name});	159	} elsif ($score < $rule->{range_lo}) {
		160	$score = $rule->{range_lo} + 0.001;
189	}	161	}
190	#$range_lo{$name} \|\|= 0.1;
191	#$range_hi{$name} \|\|= 1.5;
192	}	162	}
		163	# These should all be set properly if not mutable
		164	# score = range_lo = range_hi
		165	else {
		166	warn "hi != lo for " . $rule->{name} . "!" if $rule->{range_lo} != $rule->{range_hi};
		167	$score = $rule->{range_hi} = $rule->{range_lo};
		168	}
193		169
194	$output .= ".".$i."\n".	170	$output .= "." . $count . "\n" .
195	"n".$name."\n".	171	"n" . $rule->{name} . "\n" .
196	"b".$scores{$name}."\n".	172	"b" . $score . "\n" .
197	"m".$mutable_tests{$name}."\n".	173	"m" . $rule->{ismutable} . "\n" .
198	"l".$range_lo{$name}."\n".	174	"l" . $rule->{range_lo} . "\n" .
199	"h".$range_hi{$name}."\n";	175	"h" . $rule->{range_hi} . "\n";
200	$size++;	176
		177	$count++;
		178
201	}	179	}
202		180
		181	# Output this
203		182
204	open (DAT, ">tmp/scores.data");	183	open (DAT, ">$opt_o/scores.data");
205	print DAT "N$size\n", "M$mutable\n", # informational only	184	print DAT "N$count\n", "M$mutable\n"; # informational
206	$output;	185	print DAT $output;
207	close DAT;	186	close DAT;
208		187
209	open (OUT, ">tmp/scores.h");	188	open (OUT, ">$opt_o/scores.h");
210	print OUT "	189	print OUT <<EOF;
211	#include <stdio.h>	190	#include <stdio.h>
212	#include <string.h>	191	#include <string.h>
213	#include <stdlib.h>	192	#include <stdlib.h>
214		193
215	int num_scores = $size;	194	int num_scores = $count;
216	int num_mutable = $mutable;	195	int num_mutable = $mutable;
217	unsigned char is_mutable[$size];	196	unsigned char is_mutable[$count];
218	double range_lo[$size];	197	double range_lo[$count];
219	double range_hi[$size];	198	double range_hi[$count];
220	double bestscores[$size];	199	double bestscores[$count];
221	char *score_names[$size];	200	char *score_names[$count];
222	double tmp_scores[$size][2];	201	double tmp_scores[$count][2];
223	unsigned char ny_hit[$mutable];	202	unsigned char ny_hit[$mutable];
224	unsigned char yn_hit[$mutable];	203	unsigned char yn_hit[$mutable];
225		204
226	double lookup[$mutable];	205	double lookup[$mutable];
227		206
228	/* readscores() is defined in tests.h */	207	/* readscores() is defined in tests.h */
		208	EOF
229		209
230	";
231	close OUT;	210	close OUT;
232		211
233	writetests_c($max_hits_per_msg); # make sure $rule_to_index is around
234	}	212	}
235		213
		214
236	sub writetests_c {	215	sub writetests_c {
237	my $max_hits_per_msg = $_[0];
238		216
239	my(%uniq_files) = ();	217	my $max_hits_per_msg = 0;
240	my(%count_keys) = ();	218	my @goodtests;
241	my(%file_key) = ();	219	my %uniq_logs;
		220	my $uniq_key;
242		221
243	my $file;	222	my $i = 0;
244		223
245	for ($file = 0; $file < $num_tests; $file++)	224	# This will "compress" the logs so that one log entry can have a
246	{	225	# "count" of n indicating it reprents n similar messages
247	my $uniq_key = $is_spam{$file} . " ";
248		226
249	my(@good_tests) =	227	foreach my $log (@$logs) {
250	grep {length($_) && (! $ignored_rule{$_}) &&
251	(defined($rule_to_index{$_}))} (@{ $tests_hit{$file} });
252		228
253	@good_tests = sort {$a <=> $b} (map {$rule_to_index{$_}} (@good_tests));	229	(@goodtests) = grep {exists($_->{index})} (@{$log->{tests_hit}});
		230	@goodtests = sort {$a <=> $b} map {$_->{index}} @goodtests;
254		231
255	$uniq_key .= join(" ",@good_tests);	232	if($max_hits_per_msg < scalar(@goodtests)) {
		233	$max_hits_per_msg = scalar(@goodtests);
		234	}
256		235
257	if (exists($count_keys{$uniq_key})) {	236	$uniq_key = $log->{isspam} ? "s" : "";
258	$count_keys{$uniq_key}++;	237	$uniq_key .= join(" ", @goodtests);
		238
		239
		240	# The %count_keys hash's entries will be the log info for each unique log
		241	# $log->{count} is increased to indicate similar logs
		242
		243	if (exists($uniq_logs{$uniq_key})) {
		244	$uniq_logs{$uniq_key}->{count}++;
259	} else {	245	} else {
260	$count_keys{$uniq_key} = 1;	246	$uniq_logs{$uniq_key} = $log;
261	$file_key{$file} = $uniq_key;	247	$uniq_logs{$uniq_key}->{count} = 1;
262	$uniq_files{$file} = scalar(keys(%count_keys)) - 1;
263	}	248	}
		249
264	}	250	}
265		251
266	my $num_nondup = scalar(keys(%uniq_files));	252	my $num_nondup = scalar(keys %uniq_logs);
267		253
268	open (TOP, ">tmp/tests.h");	254	open TOP, ">$opt_o/tests.h";
269	print TOP "	255	print TOP <<EOF;
270	#include <stdio.h>	256	#include <stdio.h>
271	#include <string.h>	257	#include <string.h>
272	#include <stdlib.h>	258	#include <stdlib.h>
Lines 274-280 Link Here
274	int num_tests = $num_tests;	260	int num_tests = $num_tests;
275	int num_nondup = $num_nondup;	261	int num_nondup = $num_nondup;
276	int num_spam = $num_spam;	262	int num_spam = $num_spam;
277	int num_nonspam = $num_nonspam;	263	int num_nonspam = $num_ham;
278	int max_hits_per_msg = $max_hits_per_msg;	264	int max_hits_per_msg = $max_hits_per_msg;
279	unsigned char num_tests_hit[$num_nondup];	265	unsigned char num_tests_hit[$num_nondup];
280	unsigned char is_spam[$num_nondup];	266	unsigned char is_spam[$num_nondup];
Lines 282-477 Link Here
282	double scores[$num_nondup];	268	double scores[$num_nondup];
283	double tmp_total[$num_nondup];	269	double tmp_total[$num_nondup];
284	int tests_count[$num_nondup];	270	int tests_count[$num_nondup];
		271	EOF
285		272
286	";	273
287	$_ = join ('', <DATA>);	274	print TOP join('', <DATA>);
288	print TOP $_;
289	close TOP;	275	close TOP;
290		276
291	open (DAT, ">tmp/tests.data");
292		277
293	foreach $file (sort {$a <=> $b} (keys %uniq_files)) {	278	open (DAT, ">$opt_o/tests.data");
294	print DAT ".".$uniq_files{$file}."\n";
295		279
296	my $out = '';	280	my $out;
297	$out .= "s".$is_spam{$file}."\n";	281	my $base_score;
		282	my $num_tests_hit;
298		283
299	my $base_score = 0;	284	$i = 0;
300	my $num_tests_hit = 0;	285	foreach my $log (values %uniq_logs) {
301	foreach my $test (@{$tests_hit{$file}}) {	286	$out = '';
302	if ($test eq '') { next; }	287	$base_score = $num_tests_hit = 0;
303		288
304	if ($ignored_rule{$test}) {	289	print DAT "." . $i . "\n";
305	warn "ignored rule $test got a hit in $file!\n";	290
306	next;	291	$out .= "s" . ( ($log->{isspam})? 1 : 0 ) . "\n";
		292
		293	foreach my $test (@{$log->{tests_hit}}) {
		294	if (!$test->{score}) {
		295	# Don't really know why this happens, but the old logs-to-c
		296	#did it too
		297
		298	warn "ignored rule " . $test->{name} . " got a hit!";
		299	next;
307	}	300	}
308		301
309	if (!defined $rule_to_index{$test}) {	302	if (!$test->{range_lo} && !$test->{range_hi}) {
310	warn "test with no C index: $test\n";	303	# We ignored this rule
311	next;	304	next;
312	}	305	}
313		306
314	if ($mutable_tests{$test}) {	307	# debugging...
315	$num_tests_hit++;	308	if (!defined $test->{index}) {
316	$out .= "t".$rule_to_index{$test}."\n";	309	warn "test with no index";
317
318	if ($num_tests_hit >= $max_hits_per_msg) {
319	die "Need to increase \$max_hits_per_msg";
320	}	310	}
321	} else {
322	$base_score += $scores{$test};
323	}
324	}
325		311
326	$out .= "b" . $base_score . "\n"; # score to add in for non-mutable tests	312	if ($test->{ismutable}) {
327	$out .= "c" . $count_keys{$file_key{$file}} . "\n";	313	$num_tests_hit++;
		314	$out .= "t".$test->{index}."\n";
328		315
329	print DAT "n".$num_tests_hit."\n".$out;	316	if ($num_tests_hit >= $max_hits_per_msg) {
330	}	317	die "\$max_hits_per_msg not big enough!";
331	close DAT;	318	}
332	}
333		319
334	sub read_ranges {	320	}
335	if (!-f 'tmp/ranges.data') {	321	else {
336	system ("make tmp/ranges.data");	322	$base_score += $test->{score};
337	}	323	}
338		324
339	# read ranges, and mutableness, from ranges.data.
340	open (IN, "<tmp/ranges.data")
341	or die "need to run score-ranges-from-freqs first!";
342
343	my $count = 0;
344	while (<IN>) {
345	/^(\S+) (\S+) (\d+) (\S+)$/ or next;
346	my $t = $4;
347	$range_lo{$t} = $1+0;
348	$range_hi{$t} = $2+0;
349	my $mut = $3+0;
350
351	if ($allrules{$t}->{issubrule}) {
352	$ignored_rule{$t} = 1;
353	$mutable_tests{$t} = 0;
354	next;
355	}	325	}
356	if (($range_lo{$t} == $range_hi{$t}) && (! $range_lo{$t})) {
357	#warn "ignored rule: score and range == 0: $t\n";
358	$ignored_rule{$t} = 1;
359	$mutable_tests{$t} = 0;
360	next;
361	}
362		326
363	$ignored_rule{$t} = 0;	327	$out .= "b" . $base_score . "\n"; # score to add for non-mutable tests
364	$index_to_rule[$count] = $t;	328	$out .= "c" . $log->{count} . "\n"; # number of identical logs
365	$count++;
366		329
367	if (!$mut) {	330	print DAT "n" . $num_tests_hit . "\n" . $out;
368	$mutable_tests{$t} = 0;
369	} elsif ($range_lo{$t} == $range_hi{$t}) {
370	$mutable_tests{$t} = 0;
371	} elsif ($allrules{$t}->{tflags} =~ m/\buserconf\b/i) {
372	$mutable_tests{$t} = 0;
373	} else {
374	$mutable_tests{$t} = 1;
375	}
376	unless ($mutable_tests{$t} \|\| $scores{$t}) {
377	$ignored_rule{$t} = 1;
378	}
379	}
380	close IN;
381		331
382	# catch up on the ones missed; seems to be userconf or 0-hitters mostly.	332	$i++;
383	foreach my $t (sort keys %allrules) {
384	next if (exists($range_lo{$t}));
385	if ($allrules{$t}->{issubrule}) {
386	$ignored_rule{$t} = 1;
387	$mutable_tests{$t} = 0;
388	next;
389	}
390	$ignored_rule{$t} = 0;
391	unless (exists($mutable_tests{$t}) &&
392	($allrules{$t}->{tflags} !~ m/\buserconf\b/i)) {
393	$mutable_tests{$t} = 0;
394	}
395	unless ($mutable_tests{$t} \|\| $scores{$t}) {
396	$ignored_rule{$t} = 1;
397	}
398	$index_to_rule[$count] = $t;
399	$count++;
400	}	333	}
401	foreach my $t (keys %range_lo) {
402	next if ($ignored_rule{$t});
403	if ($mutable_tests{$t}) {
404	if (($scores{$t} == 1) && ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
405	$scores{$t} = -1;
406	} elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/) &&
407	($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
408	$scores{$t} = -0.01;
409	}
410	if ($scores{$t} >= $range_hi{$t}) {
411	$scores{$t} = $range_hi{$t} - 0.001;
412	} elsif ($scores{$t} <= $range_lo{$t}) {
413	$scores{$t} = $range_lo{$t} + 0.001;
414	}
415	} else {
416	if ($allrules{$t}->{tflags} =~ m/\buserconf\b/i) {
417	next;
418	} elsif ($range_lo{$t} == $range_hi{$t}) {
419	$scores{$t} = $range_lo{$t};
420	next;
421	}
422	if (($scores{$t} == 1) && ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
423	$scores{$t} = -1;
424	} elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/) &&
425	($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
426	$scores{$t} = -0.01;
427	}
428	if ($scores{$t} > $range_hi{$t}) {
429	$scores{$t} = $range_hi{$t};
430	} elsif ($scores{$t} < $range_lo{$t}) {
431	$scores{$t} = $range_lo{$t};
432	}
433	}
434	}
435	}
436		334
437	sub evaluate {	335	close DAT;
438	my $fprate = ($ga_ny / $num_tests) * 100.0;
439	my $fnrate = ($ga_yn / $num_tests) * 100.0;
440		336
441	printf ("\n# SUMMARY for threshold %3.1f:\n", $threshold);
442	printf "# Correctly non-spam: %6d %4.2f%% (%4.2f%% of non-spam corpus)\n", $ga_nn,
443	($ga_nn / $num_tests) * 100.0, ($ga_nn / $num_nonspam) * 100.0;
444	printf "# Correctly spam: %6d %4.2f%% (%4.2f%% of spam corpus)\n" , $ga_yy,
445	($ga_yy / $num_tests) * 100.0, ($ga_yy / $num_spam) * 100.0;
446	printf "# False positives: %6d %4.2f%% (%4.2f%% of nonspam, %6.0f weighted)\n", $ga_ny,
447	$fprate, ($ga_ny / $num_nonspam) * 100.0, $nyscore*$nybias;
448	printf "# False negatives: %6d %4.2f%% (%4.2f%% of spam, %6.0f weighted)\n", $ga_yn,
449	$fnrate, ($ga_yn / $num_spam) * 100.0, $ynscore;
450		337
451	# convert to the TCR metrics used in the published lit
452	my $nspamspam = $ga_yy;
453	my $nspamlegit = $ga_yn;
454	my $nlegitspam = $ga_ny;
455	my $nlegitlegit = $ga_yn;
456	my $nlegit = $num_nonspam;
457	my $nspam = $num_spam;
458
459	my $werr = ($lambda * $nlegitspam + $nspamlegit)
460	/ ($lambda * $nlegit + $nspam);
461
462	my $werr_base = $nspam
463	/ ($lambda * $nlegit + $nspam);
464
465	$werr \|\|= 0.000001; # avoid / by 0
466	my $tcr = $werr_base / $werr;
467
468	my $sr = ($nspamspam / $nspam) * 100.0;
469	my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;
470	printf "# TCR: %3.6f SpamRecall: %3.3f%% SpamPrec: %3.3f%% FP: %3.2f%% FN: %3.2f%%\n", $tcr, $sr, $sp, $fprate, $fnrate;
471	}	338	}
472		339
473	__DATA__
474		340
		341	__DATA__
475	void loadtests (void) {	342	void loadtests (void) {
476	FILE *fin = fopen ("tmp/tests.data", "r");	343	FILE *fin = fopen ("tmp/tests.data", "r");
477	char buf[256];	344	char buf[256];
Lines 557-560 Link Here
557		424
558	printf ("Read scores for %d tests.\n", num_scores);	425	printf ("Read scores for %d tests.\n", num_scores);
559	}	426	}
560

Lines 1-251 Link Here

(-)score-ranges-from-freqs (-251 lines)
1	#!/usr/bin/perl -w
2	#
3	# <@LICENSE>
4	# Copyright 2004 Apache Software Foundation
5	#
6	# Licensed under the Apache License, Version 2.0 (the "License");
7	# you may not use this file except in compliance with the License.
8	# You may obtain a copy of the License at
9	#
10	# http://www.apache.org/licenses/LICENSE-2.0
11	#
12	# Unless required by applicable law or agreed to in writing, software
13	# distributed under the License is distributed on an "AS IS" BASIS,
14	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15	# See the License for the specific language governing permissions and
16	# limitations under the License.
17	# </@LICENSE>
18
19	# (rough) graphic demo of this algorithm:
20	# 0.0 = -limit [......] 0 ........ limit
21	# 0.25 = -limit ..[..... 0 .]...... limit
22	# 0.5 = -limit ....[... 0 ...].... limit
23	# 0.75 = -limit ......[. 0 .....].. limit
24	# 1.0 = -limit ........ 0 [......] limit
25	my $sliding_window_limits = 4.8; # limits = [-$range, +$range]
26	my $sliding_window_size = 5.5; # scores have this range within limits
27
28	# 0.0 = -limit [......] 0 ........ limit
29	# 0.25 = -limit ....[... 0 ]....... limit
30	# 0.5 = -limit ......[. 0 .]...... limit (note: tighter)
31	# 0.75 = -limit .......[ 0 ...].... limit
32	# 1.0 = -limit ........ 0 [......] limit
33	my $shrinking_window_lower_base = 0.00;
34	my $shrinking_window_lower_range = 1.00; # *ratio, added to above
35	my $shrinking_window_size_base = 1.00;
36	my $shrinking_window_size_range = 1.00; # *ratio, added to above
37
38	my $use_sliding_window = 0;
39
40	my $argcffile = shift @ARGV;
41	my $scoreset = shift @ARGV;
42	$scoreset = 0 if ( !defined $scoreset );
43
44	if (defined ($argcffile) && $argcffile eq '-test') {
45	# use this to debug the ranking -> score-range mapping:
46	for $rat (0.0, 0.25, 0.5, 0.75, 1.0) {
47	my ($lo, $hi); if ($use_sliding_window) {
48	($lo, $hi) = sliding_window_ratio_to_range($rat);
49	} else {
50	($lo, $hi) = shrinking_window_ratio_to_range($rat);
51	}
52	warn "test: $rat => [ $lo $hi ]\n";
53	} exit;
54	}
55
56	my %freq_spam = ();
57	my %freq_nonspam = ();
58
59	my $num_spam;
60	my $num_nonspam;
61	my $num_total;
62
63	my %mutable_tests = ();
64	my %ranking = ();
65	my %soratio = ();
66	my %is_nice = ();
67
68	if (!defined $argcffile) { $argcffile = "../rules"; }
69	system ("./parse-rules-for-masses -d \"$argcffile\" -s $scoreset") and die;
70	if (-e "tmp/rules.pl") {
71	# Note, the spaces need to stay in front of the require to work around a RPM 4.1 problem
72	require "./tmp/rules.pl";
73	}
74	else {
75	die "parse-rules-for-masses had no error but no tmp/rules.pl!?!";
76	}
77
78	while (<>) {
79	/^\s([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+\S+\s+(.+)\s$/ or next;
80
81	my $overall = $1+0;
82	my $spam = $2+0;
83	my $nonspam = $3+0;
84	my $soratio = $4+0;
85	my $ranking = $5+0;
86	my $test = $6;
87
88	if ($test eq '(all messages)') {
89	$num_spam = $spam;
90	$num_nonspam = $nonspam;
91	$num_total = $spam+$nonspam;
92	next;
93	}
94	next if ($test eq '(all messages as %)');
95
96	if (!defined ($rules{$test})) {
97	warn "rule $test no longer exists; ignoring\n";
98	next;
99	}
100
101	$freq{$test} = $overall;
102	$freq_spam{$test} = $spam;
103	$freq_nonspam{$test} = $nonspam;
104
105	my $tflags = $rules{$test}->{tflags}; $tflags \|\|= '';
106	if ($tflags =~ /\buserconf\b/ \|\|
107	( ($scoreset % 2) == 0 && $tflags =~ /\bnet\b/ )) {
108	$mutable_tests{$test} = 0;
109	} else {
110	$mutable_tests{$test} = 1;
111	}
112	if ($tflags =~ m/\bnice\b/i) {
113	$is_nice{$test} = 1;
114	} else {
115	$is_nice{$test} = 0;
116	}
117
118	if ($overall < 0.01) { # less than 0.01% of messages were hit
119	$mutable_tests{$test} = 0;
120	$soratio{$test} = 0.5;
121	$ranking{$test} = 0.0;
122	$rules{$test}->{score} = 0; # tvd - disable these rules automagically
123
124	} else {
125	$soratio{$test} = $soratio;
126	$ranking{$test} = $ranking;
127	}
128	}
129
130	if ( ! mkdir "tmp", 0755 ) {
131	warn "Couldn't create tmp directory!: $!\n";
132	}
133
134	open (OUT, ">tmp/ranges.data");
135	foreach my $test (sort { $ranking{$b} <=> $ranking{$a} } keys %freq) {
136	if (!defined ($rules{$test})) {
137	warn "no rule $test";
138	print OUT ("0 0 0 $test\n");
139	next;
140	}
141
142	my $overall = $freq{$test};
143	my $spam = $freq_spam{$test};
144	my $nonspam = $freq_nonspam{$test};
145	my $soratio = $soratio{$test};
146	my $ranking = $ranking{$test};
147	my $mutable = $mutable_tests{$test};
148
149	if (!$mutable \|\| $rules{$test}->{score} == 0) { # didn't look for score 0 - tvd
150	printf OUT ("%3.3f %3.3f 0 $test\n",
151	$rules{$test}->{score},
152	$rules{$test}->{score});
153	next;
154	}
155
156	# 0.0 = best nice, 1.0 = best nonnice
157	if ($is_nice{$test}) {
158	$ranking = .5 - ($ranking / 2);
159	} else {
160	$ranking = .5 + ($ranking / 2);
161	}
162
163	my ($lo, $hi);
164	if ($use_sliding_window) {
165	($lo, $hi) = sliding_window_ratio_to_range($ranking);
166	} else {
167	($lo, $hi) = shrinking_window_ratio_to_range($ranking);
168	}
169
170	# tvd
171	my $tflags = $rules{$test}->{tflags}; $tflags \|\|= '';
172	if ( $is_nice{$test} && ( $ranking < .5 ) ) { # proper nice rule
173	if ( $tflags =~ /\blearn\b/ ) { # learn rules should get a higher score # -5.4
174	$lo *=1.8;
175	}
176	elsif ($soratio <= 0.05 && $nonspam > 0.5) { # let good rules be larger if they want to, -4.5
177	$lo *= 1.5;
178	}
179
180	$hi = ($soratio == 0) ? $lo :
181	($soratio <= 0.005 ) ? $lo/1.1 :
182	($soratio <= 0.010 && $nonspam > 0.2) ? $lo/2.0 :
183	($soratio <= 0.025 && $nonspam > 1.5) ? $lo/10.0 :
184	0;
185
186	if ( $soratio >= 0.35 ) { # auto-disable bad rules
187	($lo,$hi) = (0,0);
188	}
189	}
190	elsif ( !$is_nice{$test} && ( $ranking >= .5 ) ) { # proper spam rule
191	if ( $tflags =~ /\blearn\b/ ) { # learn rules should get a higher score
192	$hi *=1.8;
193	}
194	elsif ( $soratio >= 0.99 && $spam > 1.0 ) {
195	$hi *= 1.5; # let good rules be larger if they want to
196	}
197
198	$lo = ($soratio == 1) ? $hi:
199	($soratio >= 0.995 ) ? $hi/4.0 :
200	($soratio >= 0.990 && $spam > 1.0) ? $hi/8.0 :
201	($soratio >= 0.900 && $spam > 10.0) ? $hi/24.0 :
202	0;
203
204	if ( $soratio <= 0.65 ) { # auto-disable bad rules
205	($lo,$hi) = (0,0);
206	}
207	}
208	else { # rule that has bad nice setting
209	($lo,$hi) = (0,0);
210	}
211	$mutable = 0 if ( $hi == $lo );
212
213	printf OUT ("%3.1f %3.1f $mutable $test\n", $lo, $hi);
214	}
215	close OUT;
216	exit;
217
218	sub sliding_window_ratio_to_range {
219	my $ratio = shift;
220	my $lo = -$sliding_window_limits + ($sliding_window_size * $ratio);
221	my $hi = +$sliding_window_limits - ($sliding_window_size * (1-$ratio));
222	if ($lo > $hi) { # ???
223	($lo,$hi) = ($hi,$lo);
224	}
225	($lo, $hi);
226	}
227
228	sub shrinking_window_ratio_to_range {
229	my $ratio = shift;
230	my $is_nice = 0;
231	my $adjusted = ($ratio -.5) * 2; # adj [0,1] to [-1,1]
232	if ($adjusted < 0) { $is_nice = 1; $adjusted = -$adjusted; }
233
234	#$adjusted /= 1.5 if ( $ratio < 0.95 && $ratio > 0.15 ); # tvd
235
236	my $lower = $shrinking_window_lower_base
237	+ ($shrinking_window_lower_range * $adjusted);
238	my $range = $shrinking_window_size_base
239	+ ($shrinking_window_size_range * $adjusted);
240	my $lo = $lower;
241	my $hi = $lower + $range;
242	if ($is_nice) {
243	my $tmp = $hi; $hi = -$lo; $lo = -$tmp;
244	}
245	if ($lo > $hi) { # ???
246	($lo,$hi) = ($hi,$lo);
247	}
248
249	($lo, $hi);
250	}
251

Lines 17-54 Link Here

(-)find-extremes (-153 / +182 lines)
17	# limitations under the License.	17	# limitations under the License.
18	# </@LICENSE>	18	# </@LICENSE>
19		19
20	use Getopt::Std;
21	getopts("l:L:h");
22		20
		21	use FindBin;
		22	use lib "$FindBin::Bin/../lib";
		23	use Mail::SpamAssassin::Masses;
		24	use Getopt::Long qw(:config bundling auto_help);
		25	use Pod::Usage;
		26	use strict;
		27	use warnings;
		28
23	use vars qw {	29	use vars qw {
24	$opt_h $opt_l $opt_L	30	$opt_c $opt_s $opt_l $opt_L $opt_inclang
25	};	31	};
26		32
27	sub usage {	33	GetOptions("c\|cffile=s@" => \$opt_c,
28	die "find-extremes [-l LC] [-L LC] [spam log] [nonspam log]	34	"s\|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
		35	"l\|logfile=s" => \$opt_l,
		36	"L\|language=s" => \$opt_L,
		37	"include-language=s" => \$opt_inclang);
29		38
30	-l LC also print language specific rules for lang code LC (or 'all')
31	-L LC only print language specific rules for lang code LC (or 'all')
32		39
33	options -l and -L are mutually exclusive.
34		40
35	if either the spam or and nonspam logs are unspecified, the defaults	41	my $lower = 1;
36	are \"spam.log\" and \"nonspam.log\" in the cwd.	42	#$threshold = 5;
		43	my $higher = 9;
		44	my $min_expected = 2; # Should not be set to more than 5 or less than 2
37		45
38	";	46
		47	=head1 NAME
		48
		49	find-extremes - Determine which rules are most likely to cause false positives/negatives.
		50
		51	=head1 SYNOPSIS
		52
		53	hit-frequencies [options]
		54
		55	Options:
		56	-c,--cffile=path Use path as the rules directory
		57	-s,--scoreset=n Use scoreset n
		58	-l,--logfile=file Read in file instead of masses.log
		59	-L,--language=lc Only print language specific tests for specified lang code (try 'all')
		60	--include-language=lc Also print language specific tests for specified lang code (try 'all')
		61
		62	=head1 DESCRIPTION
		63
		64	B<hit-frequencies> will read the mass-check log F<masses.log> or the
		65	log given by the B<--logfile> option. By default, B<hit-frequencies>
		66	will assume the proper values for B<--cffile> based on the header of
		67	the masses.log. The output will include the following columns:
		68
		69	=over 4
		70
		71	=item RULE
		72
		73	=item CHISQUARE
		74
		75	=item RATIO_FALSEPOS
		76
		77	=item OVER_FALSEPOS
		78
		79	=item FREQ_OVER
		80
		81	=back
		82
		83	=head1 BUGS
		84
		85	This script may or may not work as designed - it probably needs some
		86	tweaking, and I probably introduced a bug into it while re-writing for
		87	the new Masses stuff.
		88
		89	=head1 NOTES
		90
		91	This script is poorly documented. Patches welcome.
		92
		93	=cut
		94
		95
		96	$opt_s = 0 unless defined $opt_s;
		97
		98	my $ok_lang = lc ( $opt_inclang \|\| $opt_L \|\| '');
		99	$ok_lang = '.' if ($ok_lang eq 'all');
		100
		101	my $greprules = sub {
		102	my ($name, $rule) = @_;
		103
		104	return 0 if (($opt_L && !$rule->{lang}) \|\|
		105	($rule->{lang} &&
		106	(!$ok_lang \|\| $rule->{lang} !~ /^$ok_lang/i))); # Wrong language
		107
		108	return 0 if ($rule->{tflags} =~ /\bnet\b/);
		109
		110	return 1;
111
112	};
113
114	$opt_l \|\|= "masses.log";
115
116	if (!$opt_c \|\| !scalar(@$opt_c)) {
117	# Try to read this in from the log, if possible
118	open (IN, $opt_l) or die "Can't open $opt_l: $!";
119	my $files = 0; # are we in the files section?
120	while(<IN>) {
121	if (!$files) {
122	if (/^\# SVN revision:/) {
123	$opt_c = [ "$FindBin::Bin/../rules" ];
124	last;
125	} elsif (/^\# Using configuration:$/) {
126	$files = 1;
127	}
128	} elsif (/^\#\s+(.)\s$/) {
129	push (@$opt_c, $1);
130	} else {
131	# All done!
132	last;
133	}
134	}
135
136	foreach my $file (@$opt_c) {
137	die "Can't read $file" unless -r $file;
138	}
39	}	139	}
40		140
41	usage() if($opt_h \|\| ($opt_l && $opt_L));	141	my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
		142	scoreset => $opt_s,
		143	greprules => $greprules,
		144	logfile => $opt_l,
		145	nologs => 1});
42		146
43	$lower = 1;	147	$masses->readrules();
44	#$threshold = 5;	148	$masses->readlogs();
45	$higher = 9;
46	$min_expected = 2; # Should not be set to more than 5 or less than 2
47		149
48	my %freq_spam = (); # how often non-nice found in spam	150	my $rules = $masses->get_rules_hash();
		151	my $logs = $masses->get_logs();
		152
		153	my $num_spam = $masses->get_num_spam();
		154	my $num_ham = $masses->get_num_ham();
		155
49	my %freq_over_higher_falsepos = (); # how often non-nice found in ones over	156	my %freq_over_higher_falsepos = (); # how often non-nice found in ones over
50	# higher threshold that are false positives	157	# higher threshold that are false positives
51	my %freq_nonspam = (); # how often nice found in nonspam
52	my %freq_under_lower_falseneg = (); # how often nice found in ones under	158	my %freq_under_lower_falseneg = (); # how often nice found in ones under
53	# lower threshold that are false negatives	159	# lower threshold that are false negatives
54		160
Lines 59-101 Link Here
59	my %ratio_expected_falsepos = (); # ratio version of above	165	my %ratio_expected_falsepos = (); # ratio version of above
60	my %ratio_expected_falseneg = (); # ditto	166	my %ratio_expected_falseneg = (); # ditto
61		167
62	my $num_spam = 0;
63	my $num_nonspam = 0;
64	my $num_over_higher_falsepos = 0;	168	my $num_over_higher_falsepos = 0;
65	my $num_under_lower_falseneg = 0;	169	my $num_under_lower_falseneg = 0;
66	my $ok_lang = '';
67		170
68	readscores();	171	my %chisquare = ( );
		172	my %prob = ( );
69		173
70	$ok_lang = lc ($opt_l \|\| $opt_L \|\| '');
71	if ($ok_lang eq 'all') { $ok_lang = '.'; }
72		174
73	foreach my $key (keys %rules) {	175	foreach my $key (keys %$rules) {
74		176
75	if ( ($opt_L && !$rules{$key}->{lang}) \|\|	177	if ($rules->{$key}->{tflags} !~ /\buserconf\b/) {
76	($rules{$key}->{lang} &&	178	if ($rules->{$key}->{tflags} =~ m/nice/) {
77	(!$ok_lang \|\| $rules{$key}->{lang} !~ /^$ok_lang/i)
78	) ) {
79	delete $rules{$key} ; next;
80	}
81
82	if ($rules{$key}->{tflags} =~ m/net/) {
83	delete $rules{$key};
84	next;
85	}
86	if ($rules{$key}->{tflags} !~ m/userconf/) {
87	if ($rules{$key}->{tflags} =~ m/nice/) {
88	$freq_nonspam{$key} = 0;
89	$freq_under_lower_falseneg{$key} = 0;	179	$freq_under_lower_falseneg{$key} = 0;
90	} else {	180	} else {
91	$freq_spam{$key} = 0;
92	$freq_over_higher_falsepos{$key} = 0;	181	$freq_over_higher_falsepos{$key} = 0;
93	}	182	}
94	}	183	}
		184
95	}	185	}
96		186
97	readlogs();	187	foreach my $log (@$logs) {
98		188
		189	if($log->{isspam}) {
		190	# Also need to count plus_hits
		191	my $plus_hits = 0;
		192	foreach my $test (@{$log->{tests_hit}}) {
		193	$plus_hits += $test->{score} if ($test->{score} > 0);
		194	}
		195
		196	if(($log->{score} <= $lower) && $plus_hits && $plus_hits >= $lower) {
		197	$num_under_lower_falseneg++;
		198	foreach my $test (@{$log->{tests_hit}}) {
		199	$num_under_lower_falseneg++;
		200	$freq_under_lower_falseneg{$test->{name}}++ if exists $freq_under_lower_falseneg{$test->{name}};
		201	}
		202	}
		203	}
		204	else {
		205	if($log->{score} > $higher) {
		206	$num_over_higher_falsepos++;
		207	foreach my $test (@{$log->{tests_hit}}) {
		208	$num_over_higher_falsepos++;
		209	$freq_over_higher_falsepos{$test->{name}}++ if exists $freq_over_higher_falsepos{$test->{name}};
		210	}
		211	}
		212	}
		213
		214	}
		215
99	unless (($num_over_higher_falsepos >= $min_expected)	216	unless (($num_over_higher_falsepos >= $min_expected)
100	&& ($num_under_lower_falseneg >= $min_expected)) {	217	&& ($num_under_lower_falseneg >= $min_expected)) {
101	die "Insufficient extremes in dataset (" . $num_over_higher_falsepos .	218	die "Insufficient extremes in dataset (" . $num_over_higher_falsepos .
Lines 119-130 Link Here
119	}	236	}
120		237
121	my $ratio_falsepos = $num_over_higher_falsepos/$num_spam;	238	my $ratio_falsepos = $num_over_higher_falsepos/$num_spam;
122	my $ratio_falseneg = $num_under_lower_falseneg/$num_nonspam;	239	my $ratio_falseneg = $num_under_lower_falseneg/$num_ham;
123		240
124	my $skipped_non_nice = 0;	241	my $skipped_non_nice = 0;
125		242
126	foreach $rule (keys %freq_spam) {	243	# non-nice rules
127	my $expected = $freq_spam{$rule}*$ratio_falsepos;	244	foreach my $rule (keys %freq_over_higher_falsepos) {
		245	my $expected = $rules->{$rule}->{freq_spam}*$ratio_falsepos;
128	if ($expected <= $min_expected) {	246	if ($expected <= $min_expected) {
129	$skipped_non_nice++;	247	$skipped_non_nice++;
130	next;	248	next;
Lines 136-142 Link Here
136	$freq_over_higher_falsepos{$rule}/$expected;	254	$freq_over_higher_falsepos{$rule}/$expected;
137	($chisquare{$rule},$prob{$rule}) =	255	($chisquare{$rule},$prob{$rule}) =
138	chisquare($num_spam,$num_over_higher_falsepos,	256	chisquare($num_spam,$num_over_higher_falsepos,
139	$freq_spam{$rule},$freq_over_higher_falsepos{$rule});	257	$rules->{$rule}->{freq_spam},$freq_over_higher_falsepos{$rule});
140	if ($freq_over_higher_falsepos{$rule} < $expected) {	258	if ($freq_over_higher_falsepos{$rule} < $expected) {
141	$chisquare{$rule} *= -1;	259	$chisquare{$rule} *= -1;
142	}	260	}
Lines 146-153 Link Here
146		264
147	my $skipped_nice = 0;	265	my $skipped_nice = 0;
148		266
149	foreach $rule (keys %freq_nonspam) {	267	# nice rules
150	my $expected = $freq_nonspam{$rule}*$ratio_falseneg;	268	foreach my $rule (keys %freq_under_lower_falseneg) {
		269	my $expected = $rules->{$rule}->{freq_ham}*$ratio_falseneg;
151	if ($expected <= $min_expected) {	270	if ($expected <= $min_expected) {
152	$skipped_nice++;	271	$skipped_nice++;
153	next;	272	next;
Lines 158-165 Link Here
158	$ratio_expected_falseneg{$rule} =	277	$ratio_expected_falseneg{$rule} =
159	$freq_under_lower_falseneg{$rule}/$expected;	278	$freq_under_lower_falseneg{$rule}/$expected;
160	($chisquare{$rule},$prob{$rule}) =	279	($chisquare{$rule},$prob{$rule}) =
161	chisquare($num_nonspam,$num_under_lower_falseneg,	280	chisquare($num_ham,$num_under_lower_falseneg,
162	$freq_nonspam{$rule},$freq_under_lower_falseneg{$rule});	281	$rules->{$rule}->{freq_ham},$freq_under_lower_falseneg{$rule});
163	if ($freq_under_lower_falseneg{$rule} < $expected) {	282	if ($freq_under_lower_falseneg{$rule} < $expected) {
164	$chisquare{$rule} *= -1;	283	$chisquare{$rule} *= -1;
165	}	284	}
Lines 167-174 Link Here
167		286
168	warn "Skipped nice: $skipped_nice\n";	287	warn "Skipped nice: $skipped_nice\n";
169		288
170	@rules_falsepos = grep {$prob{$_} < .5} (keys %over_expected_falsepos);	289	# The rest is copied verbatim from before - its complicated and not
		290	# commented and should work unchanged except for the freq_spam and
		291	# freq_ham stuff and fixing some use strict stuff
171		292
		293	my @rules_falsepos = grep {$prob{$_} < .5} (keys %over_expected_falsepos);
		294
172	if (scalar(@rules_falsepos)) {	295	if (scalar(@rules_falsepos)) {
173	print "RULE\t\tCHISQUARE\tRATIO_FALSEPOS\tOVER_FALSEPOS\tFREQ_OVER ($num_over_higher_falsepos)\n";	296	print "RULE\t\tCHISQUARE\tRATIO_FALSEPOS\tOVER_FALSEPOS\tFREQ_OVER ($num_over_higher_falsepos)\n";
174	my(@rules_falsepos_bad) =	297	my(@rules_falsepos_bad) =
Lines 183-189 Link Here
183	$over_expected_falsepos{$a}) \|\|	306	$over_expected_falsepos{$a}) \|\|
184	($freq_over_higher_falsepos{$b} <=>	307	($freq_over_higher_falsepos{$b} <=>
185	$freq_over_higher_falsepos{$a})} (@rules_falsepos_bad);	308	$freq_over_higher_falsepos{$a})} (@rules_falsepos_bad);
186	foreach $rule (@rules_falsepos_bad) {	309	foreach my $rule (@rules_falsepos_bad) {
187	print $rule . "\t" . $prob{$rule} . "\t" .	310	print $rule . "\t" . $prob{$rule} . "\t" .
188	$ratio_expected_falsepos{$rule} . "\t" .	311	$ratio_expected_falsepos{$rule} . "\t" .
189	$over_expected_falsepos{$rule} . "\t" .	312	$over_expected_falsepos{$rule} . "\t" .
Lines 199-207 Link Here
199	($chisquare{$a} <=> $chisquare{$b}) \|\|	322	($chisquare{$a} <=> $chisquare{$b}) \|\|
200	($ratio_expected_falsepos{$a} <=>	323	($ratio_expected_falsepos{$a} <=>
201	$ratio_expected_falsepos{$b}) \|\|	324	$ratio_expected_falsepos{$b}) \|\|
202	($freq_spam{$b} <=>	325	($rules->{$b}->{freq_spam} <=>
203	$freq_spam{$a})} (@rules_falsepos_good);	326	$rules->{$a}->{freq_spam})} (@rules_falsepos_good);
204	foreach $rule (@rules_falsepos_good) {	327	foreach my $rule (@rules_falsepos_good) {
205	print $rule . "\t" . $prob{$rule} . "\t" .	328	print $rule . "\t" . $prob{$rule} . "\t" .
206	$ratio_expected_falsepos{$rule} . "\t" .	329	$ratio_expected_falsepos{$rule} . "\t" .
207	$over_expected_falsepos{$rule} . "\t" .	330	$over_expected_falsepos{$rule} . "\t" .
Lines 212-218 Link Here
212	warn "No over-falsepos to print\n";	335	warn "No over-falsepos to print\n";
213	}	336	}
214		337
215	@rules_falseneg = grep {$prob{$_} < .5} (keys %over_expected_falseneg);	338	my @rules_falseneg = grep {$prob{$_} < .5} (keys %over_expected_falseneg);
216		339
217	if (scalar(@rules_falseneg)) {	340	if (scalar(@rules_falseneg)) {
218	print "RULE\t\tCHISQUARE\tRATIO_FALSENEG\tOVER_FALSENEG\tFREQ_UNDER ($num_under_lower_falseneg)\n";	341	print "RULE\t\tCHISQUARE\tRATIO_FALSENEG\tOVER_FALSENEG\tFREQ_UNDER ($num_under_lower_falseneg)\n";
Lines 228-234 Link Here
228	$over_expected_falseneg{$a}) \|\|	351	$over_expected_falseneg{$a}) \|\|
229	($freq_under_lower_falseneg{$b} <=>	352	($freq_under_lower_falseneg{$b} <=>
230	$freq_under_lower_falseneg{$a})} (@rules_falseneg_bad);	353	$freq_under_lower_falseneg{$a})} (@rules_falseneg_bad);
231	foreach $rule (@rules_falseneg_bad) {	354	foreach my $rule (@rules_falseneg_bad) {
232	print $rule . "\t" . $prob{$rule} . "\t" .	355	print $rule . "\t" . $prob{$rule} . "\t" .
233	$ratio_expected_falseneg{$rule} . "\t" .	356	$ratio_expected_falseneg{$rule} . "\t" .
234	$over_expected_falseneg{$rule} . "\t" .	357	$over_expected_falseneg{$rule} . "\t" .
Lines 244-252 Link Here
244	($chisquare{$a} <=> $chisquare{$b}) \|\|	367	($chisquare{$a} <=> $chisquare{$b}) \|\|
245	($ratio_expected_falseneg{$a} <=>	368	($ratio_expected_falseneg{$a} <=>
246	$ratio_expected_falseneg{$b}) \|\|	369	$ratio_expected_falseneg{$b}) \|\|
247	($freq_spam{$b} <=>	370	($rules->{$b}->{freq_ham} <=>
248	$freq_spam{$a})} (@rules_falseneg_good);	371	$rules->{$a}->{freq_ham})} (@rules_falseneg_good);
249	foreach $rule (@rules_falseneg_good) {	372	foreach my $rule (@rules_falseneg_good) {
250	print $rule . "\t" . $prob{$rule} . "\t" .	373	print $rule . "\t" . $prob{$rule} . "\t" .
251	$ratio_expected_falseneg{$rule} . "\t" .	374	$ratio_expected_falseneg{$rule} . "\t" .
252	$over_expected_falseneg{$rule} . "\t" .	375	$over_expected_falseneg{$rule} . "\t" .
Lines 258-354 Link Here
258	}	381	}
259		382
260	exit;	383	exit;
261
262	sub readlogs {
263	my $spam = $ARGV[0] \|\| "spam.log";
264	my $nonspam = $ARGV[1] \|\| (-f "good.log" ? "good.log" : "nonspam.log");
265
266
267	(open(NONSPAM,$nonspam)) \|\|
268	(die "Couldn't open file '$nonspam': $!; stopped");
269
270	while (defined($line = <NONSPAM>)) {
271	if ($line =~ m/^\s*\#/) {
272	next;
273	} elsif ($line =~ m/^.\s+-?\d+\s+\S+\s(\S)/) {
274	my $tests = $1;
275	my $hits = 0;
276	my(@tests) = ();
277	foreach $test (grep {length($_)} (split(/,+/,$tests))) {
278	if (exists($rules{$test})) {
279	push @tests, $test;
280	$hits += $rules{$test}->{score};
281	}
282	}
283
284	if (scalar(@tests)) {
285	$num_nonspam++;
286	foreach $test (grep {exists($freq_nonspam{$_})} (@tests)) {
287	$freq_nonspam{$test}++;
288	}
289	if ($hits >= $higher) {
290	$num_over_higher_falsepos++;
291	foreach $test (grep
292	{exists($freq_over_higher_falsepos{$_})} (@tests)) {
293	$freq_over_higher_falsepos{$test}++;
294	}
295	}
296	}
297	} elsif ($line =~ m/\S/) {
298	chomp($line);
299	warn "Can't interpret line '$line'; skipping";
300	}
301	}
302
303	close(NONSPAM);
304
305	(open(SPAM,$spam)) \|\| (die "Couldn't open file '$spam': $!; stopped");
306
307	while (defined($line = <SPAM>)) {
308	if ($line =~ m/^\s*\#/) {
309	next;
310	} elsif ($line =~ m/^.\s+-?\d+\s+\S+\s(\S)/) {
311	my $tests = $1;
312	my $hits = 0;
313	my $plus_hits = 0;
314	my(@tests) = ();
315	foreach $test (grep {length($_)} (split(/,+/,$tests))) {
316	if (exists($rules{$test})) {
317	push @tests, $test;
318	$hits += $rules{$test}->{score};
319	if ($rules{$test}->{score} > 0) {
320	$plus_hits += $rules{$test}->{score};
321	}
322	}
323	}
324
325	if (scalar(@tests)) {
326	$num_spam++;
327	foreach $test (grep {exists($freq_spam{$_})} (@tests)) {
328	$freq_spam{$test}++;
329	}
330	if (($hits <= $lower) && $plus_hits &&
331	($plus_hits >= $lower)) {
332	$num_under_lower_falseneg++;
333	foreach $test (grep
334	{exists($freq_under_lower_falseneg{$_})} (@tests)) {
335	$freq_under_lower_falseneg{$test}++;
336	}
337	}
338	}
339	} elsif ($line =~ m/\S/) {
340	chomp($line);
341	warn "Can't interpret line '$line'; skipping";
342	}
343	}
344
345	close(SPAM);
346	}
347
348
349	sub readscores {
350	system ("./parse-rules-for-masses") and
351	die "Couldn't do parse-rules-for-masses: $?; stopped";
352	require "./tmp/rules.pl";
353	}
354