View | Details | Raw Unified | Return to bug 2853
Collapse All | Expand All

(-)lib/Mail/SpamAssassin/Masses.pm (+833 lines)
Line 0 Link Here
1
# <@LICENSE>
2
# Copyright 2004 Apache Software Foundation
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
#     http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15
# </@LICENSE>
16
17
=head1 NAME
18
19
Mail::SpamAssassin::Masses - Interface for reading and parsing rules
20
and mass-check logs for SpamAssassin
21
22
=head1 SYNOPSIS
23
24
  my $parser = Mail::SpamAssassin::Masses->new();
25
  my $rules = $parser->readrules();
26
  my $logs = $parser->readlogs();
27
28
  foreach my $test (keys %$rules) {
29
    if ($rules->{$test}->{score} > 1) {
30
      ...
31
    }
32
33
=head1 DESCRIPTION
34
35
Mail::SpamAssassin::Masses is a module to simplify the many scripts
36
that used to make up the SpamAssassin re-scoring process. By
37
consolidating all the shared code in one module, the scripts can be
38
simplified and require fewer temporary files.
39
40
=head1 METHODS
41
42
=over 4
43
44
=cut
45
46
package Mail::SpamAssassin::Masses;
47
48
use strict;
49
use warnings;
50
use Carp;
51
52
=item $parser = Mail::SpamAssassin::Masses->new( [ { opt => val, ... } ] );
53
54
Construct a new Mail::SpamAssassin::Masses object. You may pass the
55
following attribute-value pairs to the constructor.
56
57
=over 4
58
59
=item rulesdir
60
61
The directory containing rules. If multiple directories are desired,
62
an anonymous array should be passed.
63
64
=item scoreset
65
66
Scoreset to deal with.
67
68
=item logfile
69
70
Filename of mass-check log.
71
72
=item falses
73
74
Also count frequencies for false positives and false negatives from
75
the logs.
76
77
=item falsesonly
78
79
Only count false positives and false negatives.
80
81
=item greprule
82
83
Coderef that is passed a rule name and a hash ref with the entries
84
containing info about the rule. If the sub returns false, it is skipped.
85
86
=item greplog
87
88
Coderef that is passed a raw log entry. If it returns false, the entry
89
is skipped.
90
91
=item sliding_window
92
93
Use a sliding window for score ranges rather than a shrinking window.
94
95
=item nologs
96
97
Save memory by not saving the individual log results, just the
98
aggregate totals
99
100
=back
101
102
=cut
103
104
sub new {
105
106
  my $class = shift;
107
  $class = ref($class) || $class;
108
109
  my $self = shift;
110
  if (!defined $self){
111
    $self = { };
112
  }
113
114
  $self->{scoreset} ||= 0;
115
  $self->{rulesdir} ||= '';
116
  $self->{logfile} ||= "masses.log";
117
118
  bless($self, $class);
119
120
  return $self;
121
122
}
123
124
=item $parser->readrules()
125
126
Read and parse the rules from the directory specified as
127
C<rulesdir>. This loads the following keys and values into the hash
128
entry representing the rules (see below).
129
130
=over 4
131
132
=item name
133
134
Contains the rule's name.
135
136
=item score
137
138
Contains the rule's score.
139
140
=item type
141
142
Contains the rule's type (header, body, uri, etc.)
143
144
=item tflags
145
146
Contains the rules tflags (nice, autolearn, etc.) as specified in the config file.
147
148
=item lang
149
150
Set to the value of C<lang> for language-specific tests.
151
152
=item issubrule
153
154
Set to true if the rules is a sub-rule, (i.e. it starts with
155
__). Otherwise, undefined.
156
157
=item isnice
158
159
This key exists and is true if the rule is nice (i.e. with a score
160
that can be below zero).
161
162
=item describe
163
164
Set to the rule's description, in English, or in the rule's language.
165
166
=back
167
168
There may be more values once C<readlogs()> is run.
169
170
=cut
171
172
173
sub readrules {
174
175
  my $self = shift;
176
177
  $self->{rules} ||= { };
178
  my $rules = $self->{rules}; # $rules is a reference to the anon hash
179
180
  my @dirs = ref($self->{rulesdir}) ? @{$self->{rulesdir}} : $self->{rulesdir};
181
182
  foreach my $indir (@dirs) {
183
    my @files = glob("$indir/*.cf"); # no reason to only do numbered files
184
185
    foreach my $file (@files) {
186
      open (IN, "<$file") || croak("Can't open $file, $!");
187
      while(<IN>) {
188
        s/#.*$//g;
189
        s/^\s+//;
190
        s/\s+$//;
191
        next if /^$/;
192
193
        my $lang = '';
194
        if (s/^lang\s+(\S+)\s+//) {
195
          $lang = lc $1;
196
        }
197
198
        if (/^(header|rawbody|body|full|uri|meta)\s+(\S+)\s+/) {
199
          my $type = $1;
200
          my $name = $2;
201
202
          $rules->{$name} ||= { };
203
	  $rules->{$name}->{name} = $name;
204
          $rules->{$name}->{type} = $type;
205
          $rules->{$name}->{lang} = $lang if $lang;
206
          $rules->{$name}->{tflags} = '';
207
208
          if ($name =~ /^__/) {
209
	    $rules->{$name}->{issubrule} = '1';
210
	  }
211
212
        } elsif (/^describe\s+(\S+)\s+(.+)$/) {
213
214
          # Let's get description in english, por favor -- unless the rule isn't english
215
216
	  next if ($lang && (!$rules->{$1}->{lang} || $rules->{$1}->{lang} ne $lang));
217
218
          $rules->{$1} ||= { };
219
          $rules->{$1}->{describe} = $2;
220
221
        } elsif (/^tflags\s+(\S+)\s+(.+)$/) {
222
	  my $name = $1;
223
          $rules->{$name} ||= { };
224
          $rules->{$name}->{tflags} = $2;
225
	  if ($2 =~ /nice/) {
226
	    $rules->{$name}->{isnice} = 1;
227
	  }
228
        } elsif (/^score\s+(\S+)\s+(.+)$/) {
229
          my($name,$score) = ($1,$2);
230
          $rules->{$name} ||= { };
231
          if ( $score =~ /\s/ ) { # there are multiple scores
232
            ($score) = (split(/\s+/,$score))[$self->{scoreset}];
233
          }
234
          $rules->{$name}->{score} = $score;
235
        }
236
      }
237
      close IN;
238
    }
239
  }
240
  foreach my $rule (keys %{$rules}) {
241
    if (!defined $rules->{$rule}->{type}) {
242
      delete $rules->{$rule};   # no rule definition -> no rule
243
      next;
244
    }
245
246
    if (!defined $rules->{$rule}->{score}) {
247
      my $def = 1.0;
248
      if ($rule =~ /^T_/) { $def = 0.01; }
249
250
      if ($rules->{$rule}->{isnice}) {
251
        $rules->{$rule}->{score} = -$def;
252
      } else {
253
        $rules->{$rule}->{score} = $def;
254
      }
255
    }
256
257
    if ($self->{greprules} && !&{$self->{greprules}}($rule, $rules->{$rule}))
258
    {
259
      delete $rules->{$rule};
260
      next;
261
    }
262
263
  }
264
265
  $self->{_readrules} = 1;
266
}
267
268
=item $parser->readlogs()
269
270
Read and parse logs from C<logsdir>. This will create the anonymous
271
array of hashes referred to by C<$parser->{logs}>, with the following
272
keys:
273
274
=over 4
275
276
=item isspam
277
278
True if the message is spam. False or undefined otherwise.
279
280
=item isfalse
281
282
True if the message was a false negative or positive.
283
284
=item tests_hit
285
286
Array reference containing references to the hash representing each
287
rule hit.
288
289
=item score
290
291
Score the message received (under current scores).
292
293
=back
294
295
In addition, this method adds the following keys to the rule
296
information in C<$parser->{rules}>.
297
298
=over 4
299
300
=item freq_spam
301
302
Frequency hit in spam.
303
304
=item freq_ham
305
306
Frequency hit in ham.
307
308
=item freq_fp
309
310
Frequency in false positives.
311
312
=item freq_fn
313
314
Frequency in false negatives.
315
316
=back
317
318
Also, sets C<$parser->{num_spam}> and C<$parser->{num_ham}> to the number of
319
spam logs read and the number of ham logs read, respectively.
320
321
=cut
322
323
sub readlogs {
324
325
  my $self = shift;
326
327
  if (!$self->{_readrules}) {
328
    # need to read scores first!
329
    $self->readrules();
330
  }
331
332
  my $rules = $self->{rules}; # copy the ref, shorthand
333
334
  my $logs;
335
  if (! $self->{nologs}) {
336
    $self->{logs} ||= [ ];
337
    $logs = $self->{logs};
338
  }
339
340
341
  my ($num_spam, $num_ham, $count, $num_fp, $num_fn);
342
  $num_spam = $num_ham = $count = $num_fp = $num_fn = 0;
343
344
  # First, initialize stuff
345
  foreach my $rule (values %{$self->{rules}}) {
346
    $rule->{freq_spam} ||= 0;
347
    $rule->{freq_ham} ||= 0;
348
349
    if($self->{falses}) {
350
      $rule->{freq_fp} ||= 0;
351
      $rule->{freq_fn} ||= 0;
352
    }
353
354
  }
355
356
  my $file = $self->{logfile};
357
  open (IN, "<$file");
358
359
  while (<IN>) {
360
    next if /^\#/;
361
    next if /^$/;
362
    if($_ !~ /^(.)\s+(.)\s+-?[\d.]+\s+\S+(\s+\S+\s+)/) { warn "bad line: $_"; next; }
363
364
    if ($self->{greplogs} && !&{$self->{greplogs}}($_)) {
365
      next;
366
    }
367
368
    my $manual = $1;
369
    my $result = $2;
370
    $_ = $3;
371
    s/(?:bayes|time)=\S+//;
372
    s/,,+/,/g;
373
    s/^\s+//;
374
    s/\s+$//;
375
376
377
    if ($manual ne $result) {
378
      $self->{isfalse} = 1;
379
    }
380
    elsif ($self->{falsesonly}) {
381
      next;
382
    }
383
384
    if ($manual eq "s") {
385
      $num_spam++;
386
      $logs->[$count]->{isspam} = 1 unless $self->{nologs};
387
      $num_fn++ if $result eq "h";
388
    } else {
389
      $num_ham++;
390
      $num_fp++ if $result eq "s";
391
    }
392
393
    my @tests = ();
394
    my $score = 0;
395
    foreach my $tst (split (/,/, $_)) {
396
      next if ($tst eq '');
397
398
      # Don't count non-existant rules
399
      # (Could happen with greprules)
400
      next if ( !$rules->{$tst} || !$rules->{$tst}->{type} );
401
402
      if ($manual eq "s") {
403
	  $rules->{$tst}->{freq_spam}++;
404
	  $rules->{$tst}->{freq_fn}++ if ($self->{falses} && $result eq "h");
405
      }
406
      else {
407
	  $rules->{$tst}->{freq_ham}++;
408
	  $rules->{$tst}->{freq_fp}++ if ($self->{falses} && $result eq "s");
409
      }
410
411
      $score += $rules->{$tst}->{score};
412
413
      push (@tests, $rules->{$tst}) unless $self->{nologs};
414
    }
415
416
    $logs->[$count]->{tests_hit} = \@tests unless $self->{nologs};
417
    $logs->[$count]->{score} = $score;
418
419
    $count++;
420
  }
421
  close IN;
422
423
  $self->{num_spam} = $num_spam;
424
  $self->{num_ham} = $num_ham;
425
  if ($self->{falses}) {
426
    $self->{num_fn} = $num_fn;
427
    $self->{num_fp} = $num_fp;
428
  }
429
430
  $self->{_readlogs} = 1; # Done reading logs
431
432
}
433
434
=item $parser->do_statistics();
435
436
Calculate the S/O ratio and the rank for each test.
437
438
This adds the following keys to the rules hashes.
439
440
=over 4
441
442
=item spam_percent
443
444
Percentage of spam messages hit.
445
446
=item ham_percent
447
448
Percentage of ham messages hit.
449
450
=item soratio
451
452
S/O ratio -- percentage of spam messages hit divided by total
453
percentage of messages hit.
454
455
=back
456
457
=cut
458
459
sub do_statistics {
460
  my $self = shift;
461
462
  if (! $self->{_readlogs} ) {
463
    $self->readlogs();
464
  }
465
466
  my $rank_hi=0;
467
  my $rank_lo=999999;
468
469
  foreach my $rule (values %{$self->{rules}}) {
470
471
    if (!$rule->{freq_spam}) {
472
      $rule->{spam_percent} = 0;
473
    } else {
474
      $rule->{spam_percent} = $rule->{freq_spam} / $self->{num_spam} * 100.0;
475
    }
476
477
    if (!$rule->{freq_ham}) {
478
      $rule->{ham_percent} = 0;
479
    } else {
480
      $rule->{ham_percent} = $rule->{freq_ham} / $self->{num_ham} * 100.0;
481
    }
482
483
    if (!$rule->{freq_spam} && !$rule->{freq_ham}) {
484
      $rule->{soratio} = 0.5;
485
      next;
486
    }
487
488
    $rule->{soratio} = $rule->{spam_percent} / ($rule->{spam_percent} + $rule->{ham_percent});
489
490
  }
491
492
  $self->{_statistics} = 1;
493
494
}
495
496
=item $parser->do_rank();
497
498
Calculates the ranking for each rule and stores this in the
499
appropriate key.
500
501
=over 4
502
503
=item rank
504
505
"Rank" of the rule. High numbers are good, low are bad.
506
507
=back
508
509
=cut
510
511
sub do_rank {
512
513
  my $self = shift;
514
515
  if (! $self->{_statistics} ) {
516
    $self->do_statistics();
517
  }
518
519
  my $rank_hi = 0;
520
  my $rank_lo = 9999999;
521
522
  my %unwanted;
523
  my %wanted;
524
  my %wranks;
525
  my %uranks;
526
  my $rules = $self->{rules};
527
528
529
  foreach my $rule (values %{$self->{rules}}) {
530
531
    $wanted{$rule->{name}} = $rule->{isnice} ? $rule->{freq_ham} : $rule->{freq_spam};
532
    $unwanted{$rule->{name}} = $rule->{isnice} ? $rule->{freq_spam} : $rule->{freq_ham};
533
534
    $wranks{$wanted{$rule->{name}}} = 1;
535
    $uranks{$unwanted{$rule->{name}}} = 1;
536
537
  }
538
539
  my @wanted = sort { $wanted{$a} <=> $wanted{$b} } keys %wanted;
540
  my @unwanted = sort { $unwanted{$b} <=> $unwanted{$a} } keys %unwanted;
541
542
  # first half of ranking is the wanted rank
543
  my $position = 0;
544
  my $last = undef;
545
546
  foreach my $test (@wanted) {
547
    $position++ if defined $last && $last != $wanted{$test};
548
    $rules->{$test}->{rank} += $position;
549
    $last = $wanted{$test};
550
  }
551
552
  # second half is the unwanted rank
553
  $position = 0;
554
  $last = undef;
555
  my $normalize = (scalar keys %wranks) / (scalar keys %uranks);
556
557
  foreach my $test (@unwanted) {
558
    $position++ if defined $last && $last != $unwanted{$test};
559
    $rules->{$test}->{rank} += ($position * $normalize);
560
    $last = $unwanted{$test};
561
    $rank_hi = $rules->{$test}->{rank} if ($rules->{$test}->{rank} > $rank_hi);
562
    $rank_lo = $rules->{$test}->{rank} if ($rules->{$test}->{rank} < $rank_lo);
563
  }
564
565
  $rank_hi = $rank_hi - $rank_lo;
566
  foreach my $rule (values %{$rules}) {
567
    $rule->{rank} = ($rank_hi == 0) ? 0.001 : (($rule->{rank} - $rank_lo)/ $rank_hi);
568
  }
569
570
  $self->{_rank} = 1;
571
}
572
573
=item $parser->get_rules_array();
574
575
Returns a reference to an array of hash references. The values of
576
these hash have keys as listed above.
577
578
=cut
579
580
sub get_rules_array {
581
  my $self = shift;
582
  return [ values %{$self->{rules}} ];
583
}
584
585
=item $parser->get_rules_hash();
586
587
Returns a reference to a hash with rule names as keys and hash
588
references as values. The values of these hash have keys as listed
589
above.
590
591
=cut
592
593
sub get_rules_hash {
594
  my $self = shift;
595
  return $self->{rules};
596
}
597
598
=item $parser->get_logs();
599
600
Returns a reference to the array containing log entries, in the form
601
of anonymous hashes with keys as described above.
602
603
=cut
604
605
sub get_logs {
606
  my $self = shift;
607
  return $self->{logs};
608
}
609
610
=item $parser->get_num_ham();
611
612
Returns number of ham logs read.
613
614
=cut
615
616
sub get_num_ham {
617
  my $self = shift;
618
  return $self->{num_ham};
619
}
620
621
=item $parser->get_num_spam();
622
623
Returns number of spam logs read.
624
625
=cut
626
627
sub get_num_spam {
628
  my $self = shift;
629
  return $self->{num_spam};
630
}
631
632
=item $parser->do_score_ranges();
633
634
Figure out range in which score can be set based on the soratio, etc.
635
636
This is necessary so that the perceptron doesn't set silly
637
scores. (This may not be as much of a problem as it was with the old
638
GA.)
639
640
This adds the following keys to the rules hashes:
641
642
=over 4
643
644
=item ismutable
645
646
Determines whether the perceptron can select a score for this test.
647
648
=item range_lo
649
650
Determines the lowest score the perceptron can set.
651
652
=item range_hi
653
654
Determines the highest score the perceptron can set.
655
656
=cut
657
658
sub do_score_ranges() {
659
660
  my $self = shift;
661
662
  if ( !$self->{_statistics} ) {
663
    $self->do_statistics();
664
  }
665
  if ( !$self->{_rank} ) {
666
    $self->do_rank();
667
  }
668
669
  foreach my $rule (values %{$self->{rules}}) {
670
671
    my ($rank, $lo, $hi);
672
673
    $rank = $rule->{rank};
674
675
    # Get rid of rules that don't hit -- and disable completely.
676
    if ($rule->{spam_percent} + $rule->{ham_percent} < 0.01 ||
677
	$rule->{score} == 0) {
678
679
      $rule->{ismutable} = 0;
680
      $rule->{range_lo} = $rule->{range_hi} = 0;
681
      next;
682
683
    }
684
685
    # next: get rid of tests that don't apply in this scoreset
686
    # or are userconf -- set ismutable to 0, but keep the score
687
    if ($rule->{tflags} =~ /\buserconf\b/ ||
688
	(($self->{scoreset} % 2) == 0 && $rule->{tflags} =~/\bnet\b/)) {
689
690
      $rule->{ismutable} = 0;
691
      $rule->{range_lo} = $rule->{range_hi} = $rule->{score};
692
      next;
693
694
    }
695
696
697
    # Normal rules:
698
699
    # This seems to convert from [-1,1] to [0,1] but we're already in
700
    # [0,1] space - Is this right?
701
702
    # The current way ranks are calculated, > 0.5 and < 0.5 have no
703
    # special meaning
704
705
#      # 0.0 = best nice, 1.0 = best nonnice
706
#      if ($rule->{isnice}) {
707
#        $rank = .5 - ($rank / 2);
708
#      } else {
709
#        $rank = .5 + ($rank / 2);
710
#      }
711
712
    # ranks are really meant to be used as human readable - not as
713
    # tools to choose score range - use soratio instead!
714
715
    if ($self->{use_sliding_window}) {
716
      ($lo, $hi) = $self->sliding_window_ratio_to_range($rule->{soratio});
717
    } else {
718
      ($lo, $hi) = $self->shrinking_window_ratio_to_range($rule->{soratio});
719
    }
720
721
    # Modify good rules to be lower
722
    if ($rule->{isnice}) {
723
      if ($rule->{tflags} =~ /\blearn\b/) { # learn rules should get
724
                                            # higher scores (-5.4)
725
	$lo *= 1.8;
726
      }
727
      elsif ( $rule->{soratio} <= 0.05 && $rule->{ham_percent} > 0.5) {
728
	$lo *= 1.5;
729
      }
730
731
      # argh, ugly... but i'm copying it whole...
732
      $hi =	($rule->{soratio} == 0) ? $lo :
733
    		($rule->{soratio} <= 0.005 ) ? $lo/1.1 :
734
    		($rule->{soratio} <= 0.010 && $rule->{ham_percent} > 0.2) ? $lo/2.0 :
735
		($rule->{soratio} <= 0.025 && $rule->{ham_percent} > 1.5) ? $lo/10.0 :
736
		0;
737
738
      if ($rule->{soratio} >= 0.35 ) {
739
	($lo, $hi) = (0,0);
740
      }
741
    }
742
    else { # Make non-nice rules have higher scores if they're good
743
      if ($rule->{tflags} =~ /\blearn\b/ ) {
744
	$hi *= 1.8;
745
      }
746
      elsif ( $rule->{soratio} >= 0.99 && $rule->{spam_percent} > 1.0) {
747
	$hi *= 1.5;
748
      }
749
750
      $lo =	($rule->{soratio} == 1) ? $hi:
751
    		($rule->{soratio} >= 0.995 ) ? $hi/4.0 :
752
    		($rule->{soratio} >= 0.990 && $rule->{spam_percent} > 1.0) ? $hi/8.0 :
753
		($rule->{soratio} >= 0.900 && $rule->{spam_percent} > 10.0) ? $hi/24.0 :
754
		0;
755
756
      if ($rule->{soratio} <= 0.65 ) { # auto-disable bad rules
757
	($lo, $hi) = (0,0);
758
      }
759
    }
760
761
762
    # Some sanity checking
763
    if($hi < $lo) {
764
      ($lo, $hi) = ($hi, $lo);
765
    }
766
767
768
    $rule->{ismutable} = ($lo == $hi) ? 0 : 1;
769
    $rule->{range_lo} = $lo;
770
    $rule->{range_hi} = $hi;
771
772
  }
773
}
774
775
sub sliding_window_ratio_to_range {
776
  my ($self, $ratio) = @_;
777
778
  # (rough) graphic demo of this algorithm:
779
  # 0.0  = -limit [......] 0 ........ limit
780
  # 0.25 = -limit ..[..... 0 .]...... limit
781
  # 0.5  = -limit ....[... 0 ...].... limit
782
  # 0.75 = -limit ......[. 0 .....].. limit
783
  # 1.0  = -limit ........ 0 [......] limit
784
  my $sliding_window_limits = 4.8; # limits = [-$range, +$range]
785
  my $sliding_window_size =   5.5; # scores have this range within limits
786
787
  my $lo = -$sliding_window_limits + ($sliding_window_size * $ratio);
788
  my $hi = +$sliding_window_limits - ($sliding_window_size * (1-$ratio));
789
  if ($lo > $hi) {
790
    ($lo,$hi) = ($hi,$lo);
791
  }
792
  ($lo, $hi);
793
794
}
795
sub shrinking_window_ratio_to_range {
796
  my ($self, $ratio) = @_;
797
798
799
  # 0.0  = -limit [......] 0 ........ limit
800
  # 0.25 = -limit ....[... 0 ]....... limit
801
  # 0.5  = -limit ......[. 0 .]...... limit (note: tighter)
802
  # 0.75 = -limit .......[ 0 ...].... limit
803
  # 1.0  = -limit ........ 0 [......] limit
804
  my $shrinking_window_lower_base =   0.00; 
805
  my $shrinking_window_lower_range =  1.00; # *ratio, added to above
806
  my $shrinking_window_size_base =    1.00;
807
  my $shrinking_window_size_range =   1.00; # *ratio, added to above
808
809
  my $is_nice = 0;
810
  my $adjusted = ($ratio -.5) * 2;      # adj [0,1] to [-1,1]
811
  if ($adjusted < 0) { $is_nice = 1; $adjusted = -$adjusted; }
812
813
#$adjusted /= 1.5 if ( $ratio < 0.95 && $ratio > 0.15 ); # tvd
814
815
  my $lower = $shrinking_window_lower_base 
816
                        + ($shrinking_window_lower_range * $adjusted);
817
  my $range = $shrinking_window_size_base 
818
                        + ($shrinking_window_size_range * $adjusted);
819
  my $lo = $lower;
820
  my $hi = $lower + $range;
821
  if ($is_nice) {
822
    my $tmp = $hi; $hi = -$lo; $lo = -$tmp;
823
  }
824
  if ($lo > $hi) {
825
    ($lo,$hi) = ($hi,$lo);
826
  }
827
828
  ($lo, $hi);
829
}
830
831
832
# Pacify perl
833
1;
(-)hit-frequencies (-313 / +180 lines)
Lines 16-400 Link Here
16
# limitations under the License.
16
# limitations under the License.
17
# </@LICENSE>
17
# </@LICENSE>
18
18
19
19
use FindBin;
20
use FindBin;
20
use Getopt::Std;
21
use lib "$FindBin::Bin/../lib";
21
getopts("fm:M:X:l:L:pxhc:at:s:i");
22
use Mail::SpamAssassin::Masses;
23
use Getopt::Long qw(:config bundling auto_help);
24
use Pod::Usage;
25
use strict;
26
use warnings;
22
27
28
23
use vars qw {
29
use vars qw {
24
  $opt_f $opt_m $opt_M $opt_X $opt_p $opt_x $opt_h $opt_l $opt_L $opt_c
30
  $opt_f $opt_m $opt_M $opt_X $opt_p $opt_x $opt_h $opt_l $opt_L $opt_c
25
  $opt_a $opt_t $opt_s $opt_i $sorting
31
  $opt_a $opt_t $opt_s $opt_z $opt_inclang $opt_auto
26
};
32
};
27
33
28
sub usage {
34
GetOptions("c|cffile=s@" => \$opt_c,
29
  die "hit-frequencies [-c rules dir] [-f] [-m RE] [-M RE] [-X RE] [-l LC]
35
	   "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
30
                [-s SC] [-a] [-p] [-x] [-i] [spam log] [ham log]
36
	   "l|logfile=s" => \$opt_l,
37
	   "f|falses" => \$opt_f,
38
	   "a|all" => \$opt_a,
39
	   "p|percentages" => \$opt_p,
40
	   "x|extended" => \$opt_x,
41
	   "m|matchrule=s" => \$opt_m, #,
42
	   "t|tflags=s" => \$opt_t,
43
	   "M|matchlog=s" => \$opt_M,
44
	   "X|excludelog=s" => \$opt_X,
45
	   "L|language=s" => \$opt_L,
46
	   "include-language=s" => \$opt_inclang);
31
47
32
    -c p   use p as the rules directory
33
    -f     falses. count only false-negative or false-positive matches
34
    -m RE  print rules matching regular expression
35
    -t RE  print rules with tflags matching regular expression
36
    -M RE  only consider log entries matching regular expression
37
    -X RE  don't consider log entries matching regular expression
38
    -l LC  also print language specific rules for lang code LC (or 'all')
39
    -L LC  only print language specific rules for lang code LC (or 'all')
40
    -a     display all tests
41
    -p     percentages. implies -x
42
    -x     extended output, with S/O ratio and scores
43
    -s SC  which scoreset to use
44
    -i     use IG (information gain) for ranking
45
48
46
    options -l and -L are mutually exclusive.
49
=head1 NAME
47
50
48
    options -M and -X are *not* mutually exclusive.
51
hit-frequencies - Display statistics about tests hit by a mass-check run
49
52
50
    if either the spam or and ham logs are unspecified, the defaults
53
=head1 SYNOPSIS
51
    are \"spam.log\" and \"ham.log\" in the cwd.
52
54
53
";
55
hit-frequencies [options]
54
}
55
56
56
usage() if($opt_h || ($opt_l && $opt_L));
57
 Options:
58
    -c,--cffile=path	  Use path as the rules directory
59
    -s,--scoreset=n	  Use scoreset n
60
    -l,--logfile=file	  Read in file instead of masses.log
61
    -f			  Count only false-positives/false-negatives
62
    -a			  Report all tests (including subrules)
63
    -p			  Report percentages instead of raw hits
64
    -x			  "Extended" output, include RANK, S/O and SCORE
65
    -m,--matchrule=re     Print rules matching the regular expression
66
    -t,--tflags=re	  Print only rules with tflags matching the regular expression
67
    -M,--matchlog=re      Consider only logs matching the regular expression
68
    -X,--excludelog=re	  Exclude logs matching this regular expression
69
    -L,--language=lc	  Only print language specific tests for specified lang code (try 'all')
70
    --include-language=lc Also print language specific tests for specified lang code (try 'all')
57
71
58
if ($opt_p) {
72
=head1 DESCRIPTION
59
  $opt_x = 1;
60
}
61
73
62
$opt_s = 0 if ( !defined $opt_s );
74
B<hit-frequencies> will read the mass-check log F<masses.log> or the
75
log given by the B<--logfile> option. The output will contain a
76
summary of the number of ham and spam messages and detailed statistics
77
for each rule. By default, B<hit-frequencies> will try to guess the
78
proper values for B<--cffile> based on the header of the
79
masses.log. The output will include the following columns:
63
80
64
my $cffile = $opt_c || "$FindBin::Bin/../rules";
81
=over 4
65
82
66
my %freq_spam = ();
83
=item OVERALL
67
my %freq_ham = ();
68
my $num_spam = 0;
69
my $num_ham = 0;
70
my %ranking = ();
71
my $ok_lang = '';
72
84
73
readscores($cffile);
85
Number of times (or percentage with B<-p>) the rule hit on
86
all messages (spam or ham).
74
87
75
$ok_lang = lc ($opt_l || $opt_L || '');
88
=item SPAM
76
if ($ok_lang eq 'all') { $ok_lang = '.'; }
77
89
78
foreach my $key (keys %rules) {
90
Number of times (or percentage with B<-p>) the rule hit on
91
spam messages.
79
92
80
  if ( ($opt_L && !$rules{$key}->{lang}) ||
93
=item HAM
81
       ($rules{$key}->{lang} &&
82
         (!$ok_lang || $rules{$key}->{lang} !~ /^$ok_lang/i)
83
     ) ) {
84
    delete $rules{$key} ; next;
85
  }
86
94
87
  $freq_spam{$key} = 0;
95
Number of times (or percentage with B<-p>) the rule hit on
88
  $freq_ham{$key} = 0;
96
ham messages.
89
}
90
97
91
readlogs();
98
=item S/O
92
99
93
my $hdr_all = $num_spam + $num_ham;
100
Shown only with B<-x> or B<-p>, this is the number of spam hits
94
my $hdr_spam = $num_spam;
101
divided by total number of hits (C<S/O> refers to spam divided by
95
my $hdr_ham = $num_ham;
102
overall).
96
103
97
if ($opt_p) {
104
=item RANK
98
  my $sorting = $opt_i ? "IG" : "RANK";
99
  if ($opt_f) {
100
    printf "%7s %7s %7s  %6s  %6s  %6s  %s\n",
101
  	"OVERALL%", "FNEG%", "FPOS%", "S/O", $sorting, "SCORE", "NAME";
102
  } else {
103
    printf "%7s %7s  %7s  %6s  %6s  %6s  %s\n",
104
  	"OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME";
105
  }
106
  printf "%7d  %7d  %7d  %7.3f %6.2f  %6.2f  (all messages)\n",
107
  	$hdr_all, $hdr_spam, $hdr_ham,
108
        soratio ($num_spam,$num_ham), 0, 0;
109
105
110
  $hdr_spam = ($num_spam / $hdr_all) * 100.0;
106
Shown only with B<-x> or B<-p>, this is a measure that attempts to
111
  $hdr_ham = ($num_ham / $hdr_all) * 100.0;
107
indicate how I<good> or I<useful> a test is. The higher it is, the
112
  $hdr_all = 100.0;             # this is obvious
108
better the test.
113
  printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  (all messages as %%)\n",
114
  	$hdr_all, $hdr_spam, $hdr_ham,
115
        soratio ($num_spam,$num_ham), 0, 0;
116
109
117
} elsif ($opt_x) {
110
=item SCORE
118
  printf "%7s %7s  %7s  %6s  %6s %6s  %s\n",
119
  	"OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME";
120
  printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  (all messages)\n",
121
  	$hdr_all, $hdr_spam, $hdr_ham,
122
        soratio ($num_spam,$num_ham), 0, 0;
123
111
124
} else {
112
Shown only with B<-x> or B<-p>, this is the current score assigned to
125
  printf "%10s  %10s  %10s  %s\n",
113
the rule.
126
  	"OVERALL", "SPAM", "HAM", "NAME";
127
  printf "%10d  %10d  %10d  (all messages)\n",
128
  	$hdr_all, $hdr_spam, $hdr_ham;
129
}
130
114
131
my %done = ();
115
=item NAME
132
my @tests = ();
133
my $rank_hi = 0;
134
my $rank_lo = 9999999;
135
116
136
# variables for wanted/unwanted RANK
117
This is the rule's name.
137
my %wanted;
138
my %unwanted;
139
my %wranks;
140
my %uranks;
141
118
142
foreach my $test (keys %freq_spam, keys %freq_ham) {
119
=back
143
  next unless (exists $rules{$test});           # only valid tests
144
  next if (!$opt_a && $rules{$test}->{issubrule});
145
120
146
  next if $done{$test}; $done{$test} = 1;
121
=head1 BUGS
147
  push (@tests, $test);
148
122
149
  my $isnice = 0;
123
Please report bugs to http://bugzilla.spamassassin.org/
150
  if ($rules{$test}->{tflags} =~ /nice/) { $isnice = 1; }
151
124
152
  my $fs = $freq_spam{$test}; $fs ||= 0;
125
=head1 SEE ALSO
153
  my $fn = $freq_ham{$test}; $fn ||= 0;
154
  my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0;
155
  my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0;
156
126
157
  my $soratio = $soratio{$test} = soratio ($fsadj, $fnadj);
127
L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>
158
128
159
  if ($isnice) {
129
=cut
160
    $soratio = 1.0 - $soratio;
161
    my $tmp = $fsadj; $fsadj = $fnadj; $fnadj = $tmp;
162
  }
163
130
164
  if ($opt_i) {
131
if ($opt_L && $opt_inclang) {
165
    # come up with a ranking
132
  pod2usage("-L/--language and --include-language are mutually exclusive");
166
    my $rank;
167
168
    # New new system: from "Learning to Filter Unsolicited Commercial E-Mail",
169
    # Ion Androutsopoulos et al: determine the information gain IG(X, C) of the
170
    # Boolean attributes (ie. the rules). Measures "the average reduction in
171
    # the entropy of C (classification) given the value of X (the rule)". Makes
172
    # a good ranking measure with a proper statistical basis. ;)
173
    #
174
    # Still would like to get an entropy measure in, too.
175
    #
176
    #             sum                                    P(X = x ^ C = c)
177
    # IG(X,C) = x in [0, 1]    P(X = x ^ C = c) . log2( ------------------- )
178
    #           c in [Ch, Cs]                           P(X = x) . P(C = c)
179
    #
180
    my $safe_nspam = $num_spam || 0.0000001;
181
    my $safe_nham = $num_ham || 0.0000001;
182
183
    my $num_all = ($num_spam + $num_ham);
184
    my $safe_all = $num_all || 0.0000001;
185
    my $f_all = $fs+$fn;
186
187
    my $px0 = (($num_all - $f_all) / $safe_all);         # P(X = 0)
188
    my $px1 = ($f_all / $safe_all);                      # P(X = 1)
189
    my $pccs = ($num_spam / $safe_all);                  # P(C = Cs)
190
    my $pcch = ($num_ham / $safe_all);                   # P(C = Ch)
191
    my $px1ccs = ($fs / $safe_nspam);                   # P(X = 1 ^ C = Cs)
192
    my $px1cch = ($fn / $safe_nham);                    # P(X = 1 ^ C = Ch)
193
    my $px0ccs = (($num_spam - $fs) / $safe_nspam);     # P(X = 0 ^ C = Cs)
194
    my $px0cch = (($num_ham - $fn) / $safe_nham);       # P(X = 0 ^ C = Ch)
195
    my $safe_px0_dot_pccs = ($px0 * $pccs) || 0.00000001;
196
    my $safe_px0_dot_pcch = ($px0 * $pcch) || 0.00000001;
197
    my $safe_px1_dot_pccs = ($px1 * $pccs) || 0.00000001;
198
    my $safe_px1_dot_pcch = ($px1 * $pcch) || 0.00000001;
199
200
    sub log2 { return log($_[0]) / 0.693147180559945; } # log(2) = 0.6931...
201
202
    my $safe_px0ccs = ($px0ccs || 0.0000001);
203
    my $safe_px0cch = ($px0cch || 0.0000001);
204
    my $safe_px1ccs = ($px1ccs || 0.0000001);
205
    my $safe_px1cch = ($px1cch || 0.0000001);
206
    $rank = ( $px0ccs * log2($safe_px0ccs / $safe_px0_dot_pccs) ) +
207
                    ( $px0cch * log2($safe_px0cch / $safe_px0_dot_pcch) ) +
208
                    ( $px1ccs * log2($safe_px1ccs / $safe_px1_dot_pccs) ) +
209
                    ( $px1cch * log2($safe_px1cch / $safe_px1_dot_pcch) );
210
211
    $ranking{$test} = $rank;
212
    $rank_hi = $rank if ($rank > $rank_hi);
213
    $rank_lo = $rank if ($rank < $rank_lo);
214
  }
215
  else {
216
    # basic wanted/unwanted ranking
217
    $wanted{$test} = $isnice ? $fn : $fs;
218
    $unwanted{$test} = $isnice ? $fs : $fn;
219
    # count number of ranks of each type
220
    $wranks{$wanted{$test}} = 1;
221
    $uranks{$unwanted{$test}} = 1;
222
  }
223
}
133
}
224
134
225
# finish basic wanted/unwanted ranking
135
if ($opt_p) {
226
if (! $opt_i) {
136
  $opt_x = 1;
227
  my @wanted = sort { $wanted{$a} <=> $wanted{$b} } keys %wanted;
228
  my @unwanted = sort { $unwanted{$b} <=> $unwanted{$a} } keys %unwanted;
229
230
  # first half of ranking is the wanted rank
231
  my $position = 0;
232
  my $last = undef;
233
  for my $test (@wanted) {
234
    $position++ if defined $last && $last != $wanted{$test};
235
    $ranking{$test} += $position;
236
    $last = $wanted{$test}
237
  }
238
239
  # second half of ranking is the unwanted rank
240
  my $normalize = (scalar keys %wranks) / (scalar keys %uranks);
241
  $position = 0;
242
  $last = undef;
243
  for my $test (@unwanted) {
244
    $position++ if defined $last && $last != $unwanted{$test};
245
    $ranking{$test} += ($position * $normalize);
246
    $last = $unwanted{$test};
247
    $rank_hi = $ranking{$test} if ($ranking{$test} > $rank_hi);
248
    $rank_lo = $ranking{$test} if ($ranking{$test} < $rank_lo);
249
  }
250
}
137
}
251
138
252
{
139
$opt_s = 0 if ( !defined $opt_s );
253
  # now normalise the rankings to [0, 1]
254
  $rank_hi -= $rank_lo;
255
  foreach $test (@tests) {
256
    $ranking{$test} = $rank_hi == 0 ? 0.001 : ($ranking{$test} - $rank_lo) / ($rank_hi);
257
  }
258
}
259
140
260
foreach $test (sort { $ranking{$b} <=> $ranking{$a} } @tests) {
141
my $ok_lang = lc ( $opt_inclang || $opt_L || '');
261
  next unless (exists $rules{$test});           # only valid tests
142
$ok_lang = '.' if ($ok_lang eq 'all');
262
  next if (!$opt_a && $rules{$test}->{issubrule});
263
143
264
  my $fs = $freq_spam{$test}; $fs ||= 0;
144
my $greprules = sub { # To determine whether rule should be read
265
  my $fn = $freq_ham{$test}; $fn ||= 0;
145
  my ($name, $rule) = @_;
266
  my $fa = $fs+$fn;
267
146
268
  next if ($opt_m && $test !~ m/$opt_m/);	# match certain tests
147
  return 0 if ($opt_m && $name !~ /$opt_m/); # name doesn't match -m
269
  next if ($opt_t && $rules{$test}->{tflags} !~ /$opt_t/); # match tflags
148
                                             # expression
149
  return 0 if ($opt_t && $rule->{tflags} !~ /$opt_t/); # tflags don't
150
                                                       # match -t
151
                                                       # expression
152
  return 0 if (($opt_L && !$rule->{lang}) ||
153
	   ($rule->{lang} &&
154
	    (!$ok_lang || $rule->{lang} !~ /^$ok_lang/i))); # Wrong language
270
155
156
  return 0 if ($rule->{issubrule} && !$opt_a);
157
271
  if (!$opt_a && !$opt_t) {
158
  if (!$opt_a && !$opt_t) {
272
    next if ($rules{$test}->{tflags} =~ /net/ && ($opt_s % 2 == 0));   # not net tests
159
    return 0 if ($rule->{tflags} =~ /net/ && ($opt_s % 2 == 0));
273
    next if ($rules{$test}->{tflags} =~ /userconf/); # or userconf
160
    return 0 if ($rule->{tflags} =~ /userconf/); # or userconf
274
  }
161
  }
162
  return 1;
275
163
276
  # adjust based on corpora sizes (and cvt to % while we're at it)
164
};
277
  my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0;
278
  my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0;
279
165
280
  if ($opt_f && $fsadj == 0 && $fnadj == 0) { next; }
281
166
282
  if ($opt_p) {
167
my $logfile = $opt_l || "masses.log";
283
    $fa = ($fa / ($num_spam + $num_ham)) * 100.0;
284
    $fs = $fsadj;
285
    $fn = $fnadj;
286
  }
287
168
288
  my $soratio = $soratio{$test};
169
if (!$opt_c || !scalar(@$opt_c)) {
289
  if (!defined $soratio) {
170
    # Try to read this in from the log, if possible
290
    $soratio{$test} = soratio ($fsadj, $fnadj);
171
    open IN, $logfile or die "Can't open $logfile: $!";
291
  }
172
    my $files = 0; # are we in the files section?
173
    while(<IN>) {
174
	if (!$files) {
175
	    if (/^\# SVN revision:/) {
176
		$opt_c = [ "$FindBin::Bin/../rules" ];
177
		last;
178
	    } elsif (/^\# Using configuration:$/) {
179
		$files = 1;
180
	    }
181
	} elsif (/^\#\s+(.*)\s*$/) {
182
	    push (@$opt_c, $1);
183
	} else {
184
	    # All done!
185
	    last;
186
	}
187
    }
292
188
293
  if ($opt_p) {
189
    foreach my $file (@$opt_c) {
294
    printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  %s\n",
190
	die "Can't read $file" unless -r $file;
295
  	$fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}, $test;
191
    }
296
297
  } elsif ($opt_x) {
298
    printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  %s\n",
299
  	$fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}, $test;
300
301
  } else {
302
    printf "%10d  %10d  %10d  %s\n", $fa, $fs, $fn, $test;
303
  }
304
}
192
}
305
exit;
193
	    
194
my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
195
					       scoreset => $opt_s,
196
                                               falsesonly => $opt_f,
197
                                               greprules => $greprules,
198
                                               logfile => $logfile,
199
                                               nologs => 1});
306
200
201
$masses->readrules();
202
$masses->readlogs();
203
$masses->do_statistics();
204
$masses->do_rank();
307
205
206
my $rules = $masses->get_rules_hash();
207
my $num_ham = $masses->get_num_ham();
208
my $num_spam = $masses->get_num_spam();
209
my $num_all = $num_ham + $num_spam;
308
210
309
sub readlogs {
211
if ($num_ham + $num_spam <= 0) {
310
  my $spam = $ARGV[0] || "spam.log";
212
  die "Can't run hit-frequencies on 0 messages.";
311
  my $ham = $ARGV[1] || (-f "good.log" ? "good.log" : "ham.log");
213
}
312
214
313
  foreach my $file ($spam, $ham) {
215
## Write header
314
    open (IN, "<$file") || die "Could not open file '$file': $!";
315
216
316
    my $isspam = 0; ($file eq $spam) and $isspam = 1;
217
if ($opt_p) {
317
218
318
    while (<IN>) {
219
  if ($opt_f) {
319
      next if (/^#/);
220
    printf "%7s %7s %7s  %6s  %6s  %6s  %s\n",
320
      next unless (!$opt_M || /$opt_M/o);
221
  	"OVERALL%", "FNEG%", "FPOS%", "S/O", "RANK", "SCORE", "NAME";
321
      next if ($opt_X && /$opt_X/o);
222
  } else {
223
    printf "%7s %7s  %7s  %6s  %6s  %6s  %s\n",
224
  	"OVERALL%", "SPAM%", "HAM%", "S/O", "RANK", "SCORE", "NAME";
225
  }
322
226
323
      /^(.)\s+(-?\d+)\s+(\S+)\s*(\S*)/ or next;
227
  printf "%7d  %7d  %7d  %7.3f %6.2f  %6.2f  (all messages)\n",
324
      my $caught = ($1 eq 'Y');
228
  	$num_all, $num_spam, $num_ham,
325
      my $hits = $2;
229
        $num_spam / $num_all, 0, 0;
326
      $_ = $4; s/,,+/,/g;
327
230
328
      if ($isspam) {
231
  printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  (all messages as %%)\n",
329
        if ($opt_f) {
232
  	100.0, $num_spam / $num_all * 100.0, $num_ham / $num_all * 100.0,
330
          if (!$caught) { $num_spam++; }
233
        $num_spam / $num_all, 0, 0;
331
        } else {
332
          $num_spam++;
333
        }
334
      } else {
335
        if ($opt_f) {
336
          if ($caught) { $num_ham++; }
337
        } else {
338
          $num_ham++;
339
        }
340
      }
341
234
342
      my @tests = split (/,/, $_);
235
} elsif ($opt_x) {
343
      foreach my $t (@tests) {
236
  printf "%7s %7s  %7s  %6s  %6s %6s  %s\n",
344
	next if ($t eq '');
237
  	"OVERALL", "SPAM", "HAM", "S/O", "RANK", "SCORE", "NAME";
345
	if ($isspam) {
238
  printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  (all messages)\n",
346
          if ($opt_f) {
239
  	$num_all, $num_spam, $num_ham,
347
            if (!$caught) { $freq_spam{$t}++; }
240
        $num_spam / $num_all, 0, 0;
348
          } else {
349
            $freq_spam{$t}++;
350
          }
351
	} else {
352
          if ($opt_f) {
353
            if ($caught) { $freq_ham{$t}++; }
354
          } else {
355
            $freq_ham{$t}++;
356
          }
357
	}
358
      }
359
    }
360
    close IN;
361
  }
362
}
363
241
364
242
} else {
365
sub readscores {
243
  printf "%10s  %10s  %10s  %s\n",
366
  my($cffile) = @_;
244
  	"OVERALL", "SPAM", "HAM", "NAME";
367
  system ("$FindBin::Bin/parse-rules-for-masses -d \"$cffile\" -s $opt_s") and die;
245
  printf "%10d  %10d  %10d  (all messages)\n",
368
  require "./tmp/rules.pl";
246
  	$num_all, $num_spam, $num_ham;
369
}
247
}
370
248
371
sub soratio {
249
foreach my $test (sort { $rules->{$b}->{rank} <=> $rules->{$a}->{rank} } keys %{$rules}) {
372
  my ($s, $n) = @_;
373
250
374
  $s ||= 0;
251
  if ($opt_p) {
375
  $n ||= 0;
252
    printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  %s\n",
376
253
  	($rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham}) / $num_all * 100.0,
377
  if ($s + $n > 0) {
254
        $rules->{$test}->{spam_percent}, $rules->{$test}->{ham_percent},
378
      return $s / ($s + $n);
255
        $rules->{$test}->{soratio}, $rules->{$test}->{rank}, $rules->{$test}->{score}, $test;
256
  } elsif ($opt_x) {
257
    printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  %s\n",
258
  	$rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham},
259
        $rules->{$test}->{freq_spam}, $rules->{$test}->{freq_ham},
260
        $rules->{$test}->{soratio}, $rules->{$test}->{rank}, $rules->{$test}->{score}, $test;
379
  } else {
261
  } else {
380
      return 0.5;		# no results -> not effective
262
    printf "%10d  %10d  %10d  %s\n",
263
        $rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham},
264
        $rules->{$test}->{freq_spam}, $rules->{$test}->{freq_ham}, $test;
381
  }
265
  }
382
}
266
}
383
267
384
sub tcr {
385
  my ($nspam, $nlegit, $nspamspam, $nlegitspam) = @_;
386
  my $nspamlegit = $nspam - $nspamspam;
387
  my $nlegitlegit = $nlegit - $nlegitspam;
388
389
  my $lambda = 99;
390
391
  my $werr = ($lambda * $nlegitspam + $nspamlegit)
392
                  / ($lambda * $nlegit + $nspam);
393
394
  my $werr_base = $nspam
395
                  / ($lambda * $nlegit + $nspam);
396
397
  $werr ||= 0.000001;     # avoid / by 0
398
  my $tcr = $werr_base / $werr;
399
  return $tcr;
400
}
(-)rewrite-cf-with-new-scores (-21 / +110 lines)
Lines 16-47 Link Here
16
# limitations under the License.
16
# limitations under the License.
17
# </@LICENSE>
17
# </@LICENSE>
18
18
19
=head1 NAME
20
21
rewrite-cf-with-new-scores - Rewrite SpamAssassin scores file with new
22
scores.
23
24
=head1 SYNOPSIS
25
26
rewrite-cf-with-new-scores [options]
27
28
  Options
29
  --old-scores=file    Read file containing the old SpamAssassin scores
30
  --new-scores=file    Read file containing the new SpamAssassin scores
31
  -s,--scoreset n      Rewrite scoreset n
32
  --output=file        Output rewritten score file to file
33
  -c,--cffile=path     Use path as the rules directory
34
  -l,--logfile=file    Use file instead of masses.log (for guessing -c)
35
36
 Note: these options can be shortened (i.e. --old, --new, --out) as
37
 long as they are unambiguous.
38
39
=head1 DESCRIPTION
40
41
B<rewrite-cf-with-new-scores> is a tool to update the sitewide scores
42
file with the newly generated scores. Since SpamAssassin has four
43
different scoresets, which each need to be generated separately, this
44
tool is used to only change the correct scoreset.
45
46
By default, the old scores are read from 50_scores.cf in the rules
47
directory and the new ones from ./perceptron.scores. The output will
48
be ./50_scores.cf by default.
49
50
The rules directory needs to be used to make sure scores are given for
51
the right tests. Rules not found in the rules directory will not be
52
given scores in the output.
53
54
=head1 BUGS
55
56
Please report bugs to http://bugzilla.spamassassin.org/
57
58
=head1 SEE ALSO
59
60
L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>
61
62
=cut
63
64
use FindBin;
65
use lib "$FindBin::Bin/../lib";
66
use Getopt::Long qw(:config bundling auto_help);
67
use Mail::SpamAssassin::Masses;
68
use Pod::Usage;
69
use strict;
70
use warnings;
71
72
use vars qw($opt_old $opt_new $opt_scoreset $opt_out $opt_c $opt_l);
73
74
GetOptions("old-scores=s" => \$opt_old,
75
	   "new-scores=s" => \$opt_new,
76
	   "s|scoreset=i" => \$opt_scoreset,
77
	   "output=s" => \$opt_out,
78
	   "c|cffile=s@" => \$opt_c,
79
	   "l|logfile=s" => \$opt_l);
80
81
$opt_l ||= "masses.log";
82
$opt_scoreset = 0 unless defined $opt_scoreset;
83
19
my $NUM_SCORESETS = 4;
84
my $NUM_SCORESETS = 4;
20
85
21
my ($scoreset,$oldscores,$newscores) = @ARGV;
86
if (!$opt_c || !scalar(@$opt_c)) {
87
    # Try to read this in from the log, if possible
88
    open IN, $opt_l or die "Can't open $opt_l: $!";
89
    my $files = 0; # are we in the files section?
90
    while(<IN>) {
91
	if (!$files) {
92
	    if (/^\# SVN revision:/) {
93
		$opt_c = [ "$FindBin::Bin/../rules" ];
94
		last;
95
	    } elsif (/^\# Using configuration:$/) {
96
		$files = 1;
97
	    }
98
	} elsif (/^\#\s+(.*)\s*$/) {
99
	    push (@$opt_c, $1);
100
	} else {
101
	    # All done!
102
	    last;
103
	}
104
    }
22
105
23
$scoreset = int($scoreset) if defined $scoreset;
106
    foreach my $file (@$opt_c) {
24
if (!defined $newscores || $scoreset < 0 || $scoreset >= $NUM_SCORESETS ) {
107
	die "Can't read $file" unless -r $file;
25
  die "usage: rewrite-cf-with-new-scores scoreset oldscores.cf newscores.cf\n";
108
    }
26
}
109
}
27
110
28
system ("./parse-rules-for-masses -s $scoreset") and die;
111
if (!$opt_old) {
29
if (-e "tmp/rules.pl") {
112
  $opt_old = $$opt_c[0] . "/50_scores.cf";
30
  # Note, the spaces need to stay in front of the require to work around a RPM 4.1 problem
31
  require "./tmp/rules.pl";
32
}
113
}
33
else {
34
  die "parse-rules-for-masses had no error but no tmp/rules.pl!?!";
35
}
36
114
115
$opt_new ||= "50_scores.cf";
116
117
my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
118
					       scoreset => $opt_scoreset});
119
120
$masses->readrules();
121
my $rules = $masses->get_rules_hash();
122
37
# now read the generated scores
123
# now read the generated scores
38
my @gascoreorder = ();
124
my @gascoreorder = ();
125
my %oldscores = ();
39
my %gascorelines = ();
126
my %gascorelines = ();
40
open (STDIN, "<$newscores") or die "cannot open $newscores";
127
open (STDIN, "<$opt_new") or die "cannot open $opt_new";
41
while (<STDIN>) {
128
while (<STDIN>) {
42
  /^score\s+(\S+)\s+(-?\d+(?:\.\d+)?)/ or next;
129
  /^score\s+(\S+)\s+(-?\d+(?:\.\d+)?)/ or next;
43
  my $name = $1;  my $score = $2;
130
  my $name = $1;  my $score = $2;
44
  next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0);
131
  next unless (exists ($rules->{$name}) && !$rules->{$name}->{issubrule});
45
  next if ($name =~ /^__/);
132
  next if ($name =~ /^__/);
46
  next if ($name eq '(null)');	# er, oops ;)
133
  next if ($name eq '(null)');	# er, oops ;)
47
134
Lines 49-55 Link Here
49
  push (@gascoreorder, $name);
136
  push (@gascoreorder, $name);
50
}
137
}
51
138
52
open (IN, "<$oldscores") or die "cannot open $oldscores";
139
open (IN, "<$opt_old") or die "cannot open $opt_old";
53
my $out = '';
140
my $out = '';
54
my $pre = '';
141
my $pre = '';
55
142
Lines 58-64 Link Here
58
while (<IN>) {
145
while (<IN>) {
59
  if (/^\s*score\s+(\S+)\s/) {
146
  if (/^\s*score\s+(\S+)\s/) {
60
    delete $gascorelines{$1};
147
    delete $gascorelines{$1};
61
    next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0);
148
    next unless (exists ($rules->{$1}) && $rules->{$1}->{issubrule} == 0);
62
  }
149
  }
63
  $pre .= $_;
150
  $pre .= $_;
64
  /^# Start of generated scores/ and last;
151
  /^# Start of generated scores/ and last;
Lines 82-91 Link Here
82
  if (/^\s*score\s+\S+/) {
169
  if (/^\s*score\s+\S+/) {
83
    my($score,$name,@scores) = split;
170
    my($score,$name,@scores) = split;
84
171
85
    next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0);
172
    next unless (exists ($rules->{$name}) && !$rules->{$name}->{issubrule});
86
    if (defined $gascorelines{$name}) {
173
    if (defined $gascorelines{$name}) {
87
      # Set appropriate scoreset value
174
      # Set appropriate scoreset value
88
      $scores[$scoreset] = $gascorelines{$name};
175
      $scores[$opt_scoreset] = $gascorelines{$name};
89
176
90
      # Create new score line
177
      # Create new score line
91
      $_ = join(" ","score",$name,generate_scores(@scores))."\n";
178
      $_ = join(" ","score",$name,generate_scores(@scores))."\n";
Lines 96-103 Link Here
96
}
183
}
97
close IN;
184
close IN;
98
185
186
open OUT, ">$opt_out" or die "Can't open $opt_out: $!";
187
99
# and output the lot
188
# and output the lot
100
print $pre, "\n";
189
print OUT $pre, "\n";
101
foreach my $name (@gascoreorder) {
190
foreach my $name (@gascoreorder) {
102
  $_ = $gascorelines{$name};
191
  $_ = $gascorelines{$name};
103
  next unless (defined ($_));
192
  next unless (defined ($_));
Lines 107-118 Link Here
107
  @scores = @{$oldscores{$name}} if ( exists $oldscores{$name} );
196
  @scores = @{$oldscores{$name}} if ( exists $oldscores{$name} );
108
197
109
  # Set appropriate scoreset value
198
  # Set appropriate scoreset value
110
  $scores[$scoreset] = $_;
199
  $scores[$opt_scoreset] = $_;
111
200
112
  # Create new score line
201
  # Create new score line
113
  print join(" ","score",$name,generate_scores(@scores)),"\n";
202
  print OUT join(" ","score",$name,generate_scores(@scores)),"\n";
114
}
203
}
115
print "\n", $out, "\n";
204
print OUT "\n", $out, "\n";
116
205
117
sub generate_scores {
206
sub generate_scores {
118
  my (@scores) = @_;
207
  my (@scores) = @_;
(-)Makefile (-15 / +8 lines)
Lines 4-36 Link Here
4
4
5
# What rule scoreset are we using?
5
# What rule scoreset are we using?
6
SCORESET =	0
6
SCORESET =	0
7
LOGFILE =	masses.log
7
8
8
#### Should be no need to modify below this line
9
#### Should be no need to modify below this line
9
10
10
all: badrules perceptron
11
all: badrules perceptron
11
12
12
perceptron: perceptron.o
13
perceptron: perceptron.o
13
	$(CC) -o perceptron perceptron.o $(LDFLAGS)
14
	$(CC) -o perceptron perceptron.o $(LDFLAGS) 
14
15
15
perceptron.o: tmp/rules.pl tmp/tests.h tmp/scores.h
16
perceptron.o: tmp/tests.h
16
	$(CC) $(CFLAGS) -c -o perceptron.o perceptron.c
17
	$(CC) $(CFLAGS) -c -o perceptron.o perceptron.c
17
18
18
tmp/rules.pl: tmp/.created parse-rules-for-masses
19
tmp/tests.h: tmp/.created logs-to-c
19
	perl parse-rules-for-masses -d ../rules -s $(SCORESET)
20
	perl logs-to-c --scoreset=$(SCORESET) --logfile=$(LOGFILE)
20
21
21
tmp/tests.h: tmp/.created tmp/ranges.data logs-to-c
22
freqs: masses.log
22
	perl logs-to-c --scoreset=$(SCORESET)
23
	perl hit-frequencies -x -p -s $(SCORESET) --logfile=$(LOGFILE) > freqs
23
24
24
tmp/scores.h: tmp/tests.h
25
26
tmp/ranges.data: tmp/.created freqs score-ranges-from-freqs
27
	perl score-ranges-from-freqs ../rules $(SCORESET) < freqs
28
29
freqs: spam.log ham.log
30
	perl hit-frequencies -x -p -s $(SCORESET) > freqs
31
32
badrules: freqs
25
badrules: freqs
33
	perl lint-rules-from-freqs < freqs > badrules
26
	perl lint-rules-from-freqs -s $(SCORESET) --logfile=$(LOGFILE) > badrules
34
27
35
tmp/.created:
28
tmp/.created:
36
	-mkdir tmp
29
	-mkdir tmp
(-)lint-rules-from-freqs (-244 / +167 lines)
Lines 16-139 Link Here
16
# limitations under the License.
16
# limitations under the License.
17
# </@LICENSE>
17
# </@LICENSE>
18
18
19
=head1 NAME
20
21
lint-rules-from-freqs - Try to find problems with SpamAssassin rules
22
23
=head1 SYNOPSIS
24
25
lint-rules-from-freqs [options]
26
27
 Options:
28
    -c,--cffile=path	  Use path as the rules directory
29
    -s,--scoreset=n	  Use scoreset n
30
    -l,--logfile=file	  Read in file instead of masses.log
31
    -f			  Also take into account false positives/negatives
32
33
=head1 DESCRIPTION
34
35
This script analyzes SpamAssassin tests, based on the hit frequencies
36
and S/O ratios from a mass-check log (masses.log).  This script can
37
also optionally take into account the false positive/negative
38
frequencies.
39
40
The script first uses the SpamAssassin rules parser to report on any
41
illegal syntax. Then it checks the rules match frequencies from the
42
mass-check log in order to determine how effective the rule is.
43
44
=head1 BUGS
45
46
Please report bugs to http://bugzilla.spamassassin.org/
47
48
=head1 SEE ALSO
49
50
L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>
51
52
=cut
53
54
55
use FindBin;
56
use lib "$FindBin::Bin/../lib";
57
use Mail::SpamAssassin::Masses;
58
use Mail::SpamAssassin;
59
use Getopt::Long qw(:config bundling auto_help);
60
use strict;
61
use warnings;
62
19
# any tests that get less than this % of matches on *both* spam or nonspam, are
63
# any tests that get less than this % of matches on *both* spam or nonspam, are
20
# reported.
64
# reported.
21
my $LOW_MATCHES_PERCENT = 0.03;
65
my $LOW_MATCHES_PERCENT = 0.03;
22
my $scoreset = 0;
23
66
67
use vars qw($opt_c $opt_l $opt_s $opt_f $opt_p);
68
69
GetOptions("c|cffile=s@" => \$opt_c,
70
	   "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
71
	   "l|logfile=s" => \$opt_l,
72
	   "f|falses" => \$opt_f);
73
74
75
$opt_s = 0 unless defined $opt_s;
76
$opt_l ||= "masses.log";
77
24
sub usage {
78
sub usage {
25
  die "
79
  die "
26
lint-rules-from-freqs: perform 'lint' testing on SpamAssassin rules and scores
80
lint-rules-from-freqs: perform 'lint' testing on SpamAssassin rules and scores
27
81
28
usage: ./lint-rules-from-freqs [-f falsefreqs] < freqs > badtests
82
usage: ./lint-rules-from-freqs [-c rules dir] [-l logfile] [-s scoreset] [-f]
29
83
30
This analyzes SpamAssassin tests, based on the hit frequencies and S/O ratios
84
 -c p  Use p as rules directory
31
from a mass-check logfile pair.
85
 -l f  Use f as log fine
86
 -s n  Use n as score set
87
 -f    Check also for false positives/negatives
32
88
33
The 'freqs' argument is the frequency of hits in all messages ('hit-frequencies
34
-x -p' output).
35
89
36
The 'falsefreqs' argument is frequencies of hits in false-positives and
37
false-negatives only ('hit-frequencies -x -p -f' output).
38
39
";
90
";
40
}
91
}
41
92
42
my $opt_falsefreqs;
93
43
while ($#ARGV >= 0) {
94
if (!$opt_c || !scalar(@$opt_c)) {
44
  $_ = shift @ARGV;
95
    # Try to read this in from the log, if possible
45
  if (/^-f/) { $_ = shift @ARGV; $opt_falsefreqs = $_; }
96
    open IN, $opt_l or die "Can't open $opt_l: $!";
46
  elsif (/^-s/) { $_ = shift @ARGV; $scoreset = $_; }
97
    my $files = 0; # are we in the files section?
47
  else { usage(); }
98
    while(<IN>) {
99
	if (!$files) {
100
	    if (/^\# SVN revision:/) {
101
		$opt_c = [ "$FindBin::Bin/../rules" ];
102
		last;
103
	    } elsif (/^\# Using configuration:$/) {
104
		$files = 1;
105
	    }
106
	} elsif (/^\#\s+(.*)\s*$/) {
107
	    push (@$opt_c, $1);
108
	} else {
109
	    # All done!
110
	    last;
111
	}
112
    }
113
114
    foreach my $file (@$opt_c) {
115
	die "Can't read $file" unless -r $file;
116
    }
48
}
117
}
49
118
50
print "BAD TESTS REPORT\n";
119
print "BAD TESTS REPORT\n";
51
readrules();
120
# First, do a --lint
52
print "\n" .((scalar keys %rulefile) + 1). " rules found.\n";
121
53
print "\nRule file syntax issues:\n\n";
122
print "\nRule file syntax issues:\n\n";
54
lintrules();
55
123
56
if ($opt_falsefreqs) {
124
{
57
  open (FALSE, "<$opt_falsefreqs");
125
  local (*STDERR) = \*STDOUT; # Get lint errors on STDOUT
58
  while (<FALSE>) {
126
59
    if (!/^\s*([\d\.]+)/) {
127
  # Read the config ourselves...
60
      my ($overall, $spam, $nons, $so, $score, $name) = split (' ');
128
61
      next unless ($name =~ /\S/);
129
  # Read init.pre from each directory, then glob for the rest.
62
      $falsefreqs_spam{$name} = $spam;
130
63
      $falsefreqs_nons{$name} = $nons;
131
  my $cf_txt = '';
64
      $falsefreqs_so{$name} = $so;
132
  my @files;
133
  my @dirs;
134
  foreach my $file (@$opt_c) {
135
    if (-d $file) {
136
      if  (-r "$file/init.pre") {
137
	push @files, "$file/init.pre";
138
      }
139
      push @dirs, $file;
65
    }
140
    }
141
    else {
142
      push @files, $file;
143
    }
66
  }
144
  }
67
  close FALSE;
145
  foreach my $dir (@dirs) {
68
}
146
    my @cfs = glob("$dir/*.cf");
147
    push @files, grep { -r $_ } @cfs;
148
  }
69
149
70
while (<>) {
150
  foreach my $file (@files) {
71
  if (!/^\s*([\d\.]+)/) {
151
    if (-r $file) {
72
    $output{'a_header'} = $_; next;
152
      open IN, $file;
153
      $cf_txt .= "file start $file\n";
154
      $cf_txt .= join('', <IN>);
155
      $cf_txt .= "\nfile end $file\n";
156
      close IN;
157
    }
73
  }
158
  }
74
159
160
  my $spamtest = new Mail::SpamAssassin({config_text => $cf_txt});
161
162
  $spamtest->lint_rules();
163
}
164
165
166
# Next, check for other stuff
167
my $masses = Mail::SpamAssassin::Masses->new({rulesdir => $opt_c,
168
					      scoreset => $opt_s, #,,
169
					      falses => $opt_f,
170
					      logfile => $opt_l});
171
172
$masses->readlogs();
173
$masses->do_statistics();
174
175
my $rules = $masses->get_rules_array();
176
177
178
my %output;
179
180
foreach my $rule (@$rules) {
181
75
  my $badrule;
182
  my $badrule;
76
  my ($overall, $spam, $nons, $so, $score, $name) = split (' ');
77
  next unless ($name =~ /\S/);
78
183
79
  my $ffspam = $falsefreqs_spam{$name};
184
  next if ($rule->{tflags} =~ /\bnet\b/ && ($opt_s % 2) == 0);
80
  my $ffnons = $falsefreqs_nons{$name};
185
  next if ($rule->{tflags} =~ /\buserconf\b/);
81
  my $ffso = $falsefreqs_so{$name};
82
186
83
  my $tf = $tflags{$name};
187
  if ($rule->{freq_spam} == 0 && $rule->{freq_ham} == 0) {        # sanity!
84
  next if ($tf =~ /net/ && ($scoreset % 2) == 0);
85
  next if ($tf =~ /userconf/);
86
188
87
  if ($overall == 0.0 && $spam == 0.0 && $nons == 0.0) {        # sanity!
88
    $badrule = 'no matches';
189
    $badrule = 'no matches';
89
190
90
  } else {
191
  } else {
91
    if ($score < 0.0) {
192
    if ($rule->{score} < 0.0) {
92
      # negative score with more spams than nonspams? bad rule.
193
      # negative score with more spams than nonspams? bad rule.
93
      if ($tf !~ /nice/ && $so > 0.5 && $score < 0.5) {
194
      if (!$rule->{isnice} && $rule->{soratio} > 0.5 && $rule->{score} < 0.5) {
94
        $badrule = 'non-nice but -ve score';
195
        $badrule = 'non-nice but -ve score';
95
      }
196
      }
96
197
      if ($rule->{isnice} && $rule->{soratio} > 0.5 && $rule->{score} < 0.5) {
97
      if ($tf =~ /nice/ && $so > 0.5 && $score < 0.5) {
198
        if ($opt_f && $rule->{freq_fn} < $rule->{freq_fp}) {
98
        if ($ffso < 0.5) {
99
          $badrule = 'fn';
199
          $badrule = 'fn';
100
        } else {
101
          # ignore, the FNs are overridden by other tests so it doesn't
102
          # affect the overall results.
103
        }
200
        }
201
        # else {
202
        # ignore, the FNs are overridden by other tests so it doesn't
203
        # affect the overall results.
204
        # }
104
      }
205
      }
105
206
106
      # low number of matches overall
207
      # low number of matches overall
107
      if ($nons < $LOW_MATCHES_PERCENT) 
208
      if ($rule->{ham_percent} < $LOW_MATCHES_PERCENT)
108
                 { $badrule ||= ''; $badrule .= ', low matches'; }
209
                 { $badrule ||= ''; $badrule .= ', low matches'; }
109
210
110
    } elsif ($score > 0.0) {
211
    } elsif ($rule->{score} > 0.0) {
111
      # positive score with more nonspams than spams? bad.
212
      # positive score with more nonspams than spams? bad.
112
      if ($tf =~ /nice/ && $so < 0.5 && $score > 0.5) {
213
      if ($rule->{isnice} && $rule->{soratio} < 0.5 && $rule->{score} > 0.5) {
113
        $badrule = 'nice but +ve score';
214
        $badrule = 'nice but +ve score';
114
      }
215
      }
115
216
 
116
      if ($tf !~ /nice/ && $so < 0.5 && $score > 0.5) {
217
      if (!$rule->{isnice} && $rule->{soratio} < 0.5 && $rule->{score} > 0.5) {
117
        if ($ffso > 0.5) {
218
        if ($opt_f && $rule->{freq_fp} > $rule->{freq_fn}) {
118
          $badrule = 'fp';
219
          $badrule = 'fp';
119
        } else {
120
          # ignore, the FPs are overridden by other tests so it doesn't
121
          # affect the overall results.
122
        }
220
        }
221
        # else {
222
        # ignore, the FPs are overridden by other tests so it doesn't
223
        # affect the overall results.
224
        # }
123
      }
225
      }
124
226
 
125
      # low number of matches overall
227
      # low number of matches overall
126
      if ($spam < $LOW_MATCHES_PERCENT) 
228
      if ($rule->{spam_percent} < $LOW_MATCHES_PERCENT)
127
                 { $badrule ||= ''; $badrule .= ', low matches'; }
229
                 { $badrule ||= ''; $badrule .= ', low matches'; }
128
230
 
129
    } elsif ($score == 0.0) {
231
    } elsif ($rule->{score} == 0.0) {
130
      $badrule = 'score is 0';
232
      $badrule = 'score is 0';
131
    }
233
    }
132
  }
234
  }
133
235
 
134
  if (defined $badrule) {
236
  if (defined $badrule) {
135
    $badrule =~ s/^, //; chomp;
237
    $badrule =~ s/^, //;
136
    $output{$badrule} .= $_ . " ($badrule)\n";
238
    $output{$badrule} .= $rule->{name} . " ($badrule)\n";
137
  }
239
  }
138
}
240
}
139
241
Lines 156-337 Link Here
156
exit;
258
exit;
157
259
158
260
159
sub concat_rule_lang {
160
  my $rule = shift;
161
  my $lang = shift;
162
163
  if (defined $lang && $lang ne '') {
164
    return "[$lang]_$rule";
165
  } else {
166
    return $rule;
167
  }
168
}
169
170
# note: do not use parse-rules-for-masses here, we need to do linting instead
171
# of your average parse
172
sub readrules {
173
  my @files = <../rules/[0-9]*.cf>;
174
  my $file;
175
  %rulesfound = ();
176
  %langs = ();
177
  foreach $file (@files) {
178
    open (IN, "<$file");
179
    while (<IN>) {
180
      s/#.*$//g; s/^\s+//; s/\s+$//; next if /^$/;
181
182
      # make all the foo-bar stuff foo_bar
183
      1 while s/^(\S+)-/\1_/g;
184
      1 while s/^(lang\s+\S+\s+\S+)-/\1_/g;
185
186
      my $lang = '';
187
      if (s/^lang\s+(\S+)\s+//) {
188
        $lang = $1; $langs{$1} = 1;
189
      }
190
191
      if (/^(header|rawbody|body|full|uri|meta)\s+(\S+)\s+/) {
192
        $rulesfound{$2} = 1;
193
        $rulefile{$2} ||= $file;
194
        $scorefile{$1} = $file;
195
        $score{$2} ||= 1.0;
196
        $tflags{$2} ||= '';
197
        $descfile{$2} ||= $file;       # a rule with no score or desc is OK
198
	$description{$2}->{$lang} = undef;
199
200
        if (/^body\s+\S+\s+eval:/) {
201
          # ignored
202
        } elsif (/^body\s+\S+\s+(.*)$/) {
203
          my $re = $1;
204
205
	  # If there's a ( in a rule where it should be (?:, flag it.
206
	  # but ignore [abc(] ...
207
          if ($re =~ /[^\\]\([^\?]/ && $re !~ /\[[^\]]*[^\\]\(/) { 
208
            print "warning: non-(?:...) capture in regexp in $file: $_\n";
209
          }
210
          if ($re =~ /\.[\*\+]/) { 
211
            print "warning: .* in regexp in $file: $_\n";
212
          }
213
          if ($re =~ /[^\\]\{(\d*),?(\d*?)\}/) {
214
            if ($1 > 120 || $2 > 120) {
215
              print "warning: long .{n} in regexp in $file: $_\n";
216
            }
217
          }
218
        }
219
220
      } elsif (/^describe\s+(\S+)\s+(.*?)\s*$/) {
221
        $rulesfound{$1} = 1;
222
        $descfile{concat_rule_lang ($1, $lang)} ||= $file;
223
        $descfile{$1} ||= $file;
224
	$description{$1}->{$lang} = $2;
225
      } elsif (/^tflags\s+(\S+)\s+(.+)$/) {
226
        $rulesfound{$1} = 1;
227
        $tflags{$1} = $2;
228
        $tflagsfile{concat_rule_lang ($1, $lang)} = $file;
229
        $tflagsfile{$1} = $file;
230
      } elsif (/^score\s+(\S+)\s+(.+)$/) {
231
        $rulesfound{$1} = 1;
232
        $scorefile{concat_rule_lang ($1, $lang)} = $file;
233
        $scorefile{$1} = $file;
234
        $score{$1} = $2;
235
      } elsif (/^(clear_report_template|clear_spamtrap_template|report|spamtrap|
236
                clear_terse_report_template|terse_report|
237
                required_score|ok_locales|ok_languages|test|lang|
238
                spamphrase|whitelist_from|require_version|
239
		clear_unsafe_report_template|unsafe_report|
240
		(?:bayes_)?auto_learn_threshold_nonspam|(?:bayes_)?auto_learn_threshold_spam|
241
		(?:bayes_)?auto_learn
242
                )/x) {
243
        next;
244
      } else {
245
        print "warning: unknown rule in $file: $_\n";
246
      }
247
    }
248
    close IN;
249
  }
250
  @langsfound = sort keys %langs;
251
  @rulesfound = sort keys %rulesfound;
252
}
253
254
sub lintrules {
255
  my %possible_renames = ();
256
257
  foreach my $rule (@rulesfound) {
258
    my $match = $rule;
259
    $match =~ s/_\d+[^_]+$//gs;    # trim e.g. "_20K"
260
    $match =~ s/[^A-Z]+//gs;    # trim numbers etc.
261
262
    if (defined ($rulefile{$rule}) && $possible_renames{$match} !~ / \Q$rule\E\b/) {
263
      $possible_renames{$match} .= " ".$rule;
264
    }
265
    $possible_rename_matches{$rule} = $match;
266
  }
267
268
  foreach my $lang ('', @langsfound) {
269
    foreach my $baserule (@rulesfound) {
270
      next if ( $baserule =~ /^__/ || $baserule =~ /^T_/ );
271
272
      my $rule = concat_rule_lang ($baserule, $lang);
273
      my $f = $descfile{$rule};
274
      my $warned = '';
275
276
      if (defined $f && !defined ($rulefile{$rule})
277
                && !defined ($rulefile{$baserule}))
278
      {
279
        print "warning: $baserule has description, but no rule: $f\n";
280
        $warned .= ' lamedesc';
281
      }
282
283
	# Check our convention for rule length
284
	if ( (($lang ne '' && defined($rulefile{$rule})) || ($lang eq '' && defined ($rulefile{$baserule}))) && length $baserule > 22 ) {
285
	  print "warning: $baserule has a name longer than 22 chars: $f\n";
286
	}
287
 	# Check our convention for rule length
288
	if ( (($lang ne '' && defined($rulefile{$rule})) || ($lang eq '' && defined ($rulefile{$baserule}))) && defined $description{$baserule}->{$lang} && length $description{$baserule}->{$lang} > 50 ) {
289
	  print "warning: $baserule has a description longer than 50 chars: $f\n";
290
	}
291
292
      # lang rule trumps normal rule
293
      $f = $rulefile{$rule} || $rulefile{$baserule};
294
      # if the rule exists, and the language/rule description doesn't exist ...
295
      if ( defined $f && !defined $description{$baserule}->{$lang} )
296
      {
297
        print "warning: $baserule exists, ",( $lang ne '' ? "lang $lang, " : "" ),"but has no description: $f\n";
298
        $warned .= ' lamedesc';
299
      }
300
301
302
      $f = $scorefile{$rule};
303
      if (defined $f && !defined ($rulefile{$rule})
304
                && !defined ($rulefile{$baserule}))
305
      {
306
        print "warning: $baserule has score, but no rule: $f\n";
307
        $warned .= ' lamescore';
308
      }
309
310
      my $r = $possible_rename_matches{$rule};
311
      if ($warned ne '' && defined $r) {
312
        my @matches = split (' ', $possible_renames{$r});
313
        if (scalar @matches != 0) {
314
          my $text = '';
315
316
          # now try and figure out "nearby" rules with no description/score
317
          foreach my $baser (@matches) {
318
            my $blang;
319
            if ($descfile{$rule} =~ /text_(\S\S)\./) {
320
              $blang = $1;
321
            }
322
            my $r = concat_rule_lang ($baser, $blang);
323
            #warn "$r $descfile{$r} $descfile{$baser}";
324
            next if ($warned =~ /lamedesc/ && (defined $descfile{$r}));
325
            next if ($warned =~ /lamescore/ && (defined $scorefile{$r}));
326
            $text .= " $baser";
327
          }
328
329
          if ($text ne '') {
330
            print "warning: (possible renamed rule? $text)\n";
331
          }
332
        }
333
      }
334
    }
335
  }
336
}
337
(-)mass-check (-108 / +226 lines)
Lines 16-112 Link Here
16
# limitations under the License.
16
# limitations under the License.
17
# </@LICENSE>
17
# </@LICENSE>
18
18
19
sub usage {
19
=head1 NAME
20
  die <<ENDOFUSAGE;
21
usage: mass-check [options] target ...
22
 
23
  -c=file       set configuration/rules directory
24
  -p=dir        set user-prefs directory
25
  -f=file       read list of targets from <file>
26
  -j=jobs       specify the number of processes to run simultaneously
27
  --net         turn on network checks!
28
  --mid         report Message-ID from each message
29
  --debug       report debugging information
30
  --progress    show progress updates during check
31
  --rewrite=OUT save rewritten message to OUT (default is /tmp/out)
32
  --showdots    print a dot for each scanned message
33
  --rules=RE    Only test rules matching the given regexp RE
34
  --restart=N   restart all of the children after processing N messages
35
  --deencap=RE  Extract SpamAssassin-encapsulated spam mails only if they
36
                were encapsulated by servers matching the regexp RE
37
                (default = extract all SpamAssassin-encapsulated mails)
38
 
39
  log options
40
  -o            write all logs to stdout
41
  --loghits     log the text hit for patterns (useful for debugging)
42
  --loguris	log the URIs found
43
  --hamlog=log  use <log> as ham log ('ham.log' is default)
44
  --spamlog=log use <log> as spam log ('spam.log' is default)
45
 
46
  message selection options
47
  -n            no date sorting or spam/ham interleaving
48
  --after=N     only test mails received after time_t N (negative values
49
                are an offset from current time, e.g. -86400 = last day)
50
                or after date as parsed by Time::ParseDate (e.g. '-6 months')
51
  --before=N    same as --after, except received times are before time_t N
52
  --all         don't skip big messages
53
  --head=N      only check first N ham and N spam (N messages if -n used)
54
  --tail=N      only check last N ham and N spam (N messages if -n used)
55
 
56
  simple target options (implies -o and no ham/spam classification)
57
  --dir         subsequent targets are directories
58
  --file        subsequent targets are files in RFC 822 format
59
  --mbox        subsequent targets are mbox files
60
  --mbx         subsequent targets are mbx files
61
 
62
  Just left over functions we should remove at some point:
63
  --bayes       report score from Bayesian classifier
64
 
65
  non-option arguments are used as target names (mail files and folders),
66
  the target format is: <class>:<format>:<location>
67
  <class>       is "spam" or "ham"
68
  <format>      is "dir", "file", "mbx", or "mbox"
69
  <location>    is a file or directory name.  globbing of ~ and * is supported
70
20
71
ENDOFUSAGE
21
mass-check - Generates SpamAssassin scores and results for large
72
}
22
amounts of mail
73
23
24
=head1 SYNOPSIS
25
26
 mass-check [options] class:format:location ...
27
 mass-check [options] {--dir | --file | --mbox} target ...
28
 mass-check [options] -f file
29
30
  Options:
31
    -f=file       read list of targets from <file>
32
    -j=jobs       specify the number of processes to run simultaneously
33
    --net         turn on network checks!
34
    --mid         report Message-ID from each message
35
    --debug       report debugging information
36
    --progress    show progress updates during check
37
    --rewrite=OUT save rewritten message to OUT (default is /tmp/out)
38
    --showdots    print a dot for each scanned message
39
    --rules=RE    Only test rules matching the given regexp RE
40
    --restart=N   restart all of the children after processing N messages
41
42
    SpamAssassin options
43
    -c=dir        set configuration/rules directory
44
    -p=file       set user preferences file (default: none)
45
    -s=dir        set site rules configuration directory
46
    -u=dir        set user-state directory
47
    --dist        assumes the script is being run from the masses/ dir of
48
                  the unpacked tarball, and makes appropriate guesses for
49
                  -p and -c
50
    --deencap=RE  Extract SpamAssassin-encapsulated spam mails only if they
51
                  were encapsulated by servers matching the regexp RE
52
                  (default = extract all SpamAssassin-encapsulated mails)
53
54
    log options
55
    -o            write all logs to stdout
56
    --loghits     log the text hit for patterns (useful for debugging)
57
    --loguris	  log the URIs found
58
    --log=file    log to <file> (masses.log is default)
59
60
    message selection options
61
    -n            no date sorting or spam/ham interleaving
62
    --after=N     only test mails received after time_t N (negative values
63
                  are an offset from current time, e.g. -86400 = last day)
64
                  or after date as parsed by Time::ParseDate (e.g. '-6 months')
65
    --before=N    same as --after, except received times are before time_t N
66
    --all         don't skip big messages
67
    --head=N      only check first N ham and N spam (N messages if -n used)
68
    --tail=N      only check last N ham and N spam (N messages if -n used)
69
70
    simple target options (implies -o and no ham/spam classification)
71
    --dir         subsequent targets are directories
72
    --file        subsequent targets are files in RFC 822 format
73
    --mbox        subsequent targets are mbox files
74
    --mbx         subsequent targets are mbx files
75
76
    Just left over functions we should remove at some point:
77
    --bayes       report score from Bayesian classifier
78
    --hamlog=log  use <log> as ham log ('ham.log' is default)
79
    --spamlog=log use <log> as spam log ('spam.log' is default)
80
81
=head1 DESCRIPTION
82
83
B<mass-check> is designed to assist with rule development and
84
generation of SpamAssassin scored. It reads in mail from the
85
location(s) specified on the command line (in the first form above),
86
given in the form I<class:format:location>, where I<class> is either
87
"spam" or "ham" (non-spam), I<format> is one of "dir" (Maildirs, MH,
88
etc), "file", "mbox" (mboxes can be gzipped) or "mbx".
89
90
B<mass-check> will analyze each message using SpamAssassin and
91
generate one-line of output per message, (by default to masses.log) in
92
the following format:
93
94
 {s|h} {s|h} score filename tests-hit
95
96
The first field is the message's class as given on the command line
97
(ham or spam). The second is the message's class as determined by
98
SpamAssassin. The third is the message's score, as determined by
99
SpamAssassin. The fourth field contains the message's filename; for
100
mboxes, this contains the filename and the byte offset from the
101
beginning of the file separated by a period. The last field contains a
102
list of all the tests the message hit separated by commas.
103
104
If you want to run this on the currently installed version of
105
SpamAssassin's rules for sitewide use, make sure your user_prefs file
106
contains no rules.
107
108
=head1 BUGS
109
110
Please report bugs to http://bugzilla.spamassassin.org/
111
112
=head1 SEE ALSO
113
114
L<hit-frequencies(1)>, L<logs-to-c(1)>, L<Mail::SpamAssassin::Masses(3)>,
115
L<perceptron(1)>
116
117
=cut
118
74
###########################################################################
119
###########################################################################
75
120
76
use vars qw($opt_c $opt_p $opt_f $opt_j $opt_n $opt_o $opt_all $opt_bayes
121
use vars qw($opt_c $opt_p $opt_f $opt_j $opt_n $opt_o $opt_all
77
	    $opt_debug $opt_format $opt_hamlog $opt_head $opt_loghits
122
	    $opt_bayes $opt_before $opt_debug $opt_dist $opt_format
78
	    $opt_mid $opt_mh $opt_ms $opt_net $opt_nosort $opt_progress
123
	    $opt_hamlog $opt_head $opt_log $opt_loghits $opt_mid
79
	    $opt_showdots $opt_spamlog $opt_tail $opt_rules $opt_restart
124
	    $opt_mh $opt_ms $opt_net $opt_nosort $opt_p $opt_progress
80
	    $opt_loguris $opt_after $opt_before $opt_rewrite $opt_deencap);
125
	    $opt_s $opt_showdots $opt_spamlog $opt_tail $opt_rules
126
	    $opt_restart $opt_loguris $opt_after $opt_rewrite $opt_u
127
	    $opt_deencap);
81
128
82
use FindBin;
129
use FindBin;
83
use lib "$FindBin::Bin/../lib";
130
use lib "$FindBin::Bin/../lib";
84
eval "use bytes";
131
eval "use bytes";
85
use Mail::SpamAssassin::ArchiveIterator;
132
use Mail::SpamAssassin::ArchiveIterator;
86
use Mail::SpamAssassin;
133
use Mail::SpamAssassin;
87
use Getopt::Long;
134
use Getopt::Long qw(:config bundling auto_help);
88
use POSIX qw(strftime);
135
use POSIX qw(strftime);
89
use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; };
136
use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; };
137
use strict; # Why wasn't this on?
90
use Config;
138
use Config;
91
139
92
# default settings
140
# default settings
93
$opt_c = "$FindBin::Bin/../rules";
141
94
$opt_p = "$FindBin::Bin/spamassassin";
95
$opt_j = 1;
142
$opt_j = 1;
96
$opt_net = 0;
143
$opt_net = 0;
97
$opt_hamlog = "ham.log";
144
$opt_log = "masses.log";
98
$opt_spamlog = "spam.log";
99
145
100
GetOptions("c=s", "p=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug",
146
GetOptions("c=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug",
101
	   "hamlog=s", "head=i", "loghits", "mh", "mid", "ms", "net",
147
	   "deencap=s", "dist!", "hamlog=s", "head=i", "log=s",
102
	   "progress", "rewrite:s", "showdots", "spamlog=s", "tail=i",
148
	   "loghits", "mh", "mid", "ms", "net", "p=s", "progress",
103
	   "rules=s", "restart=i", "after=s", "before=s", "loguris", "deencap=s",
149
	   "rewrite:s", "s=s", "showdots", "spamlog=s", "tail=i",
150
	   "rules=s", "restart=i", "u=s", "after=s", "loguris",
104
	   "dir" => sub { $opt_format = "dir"; },
151
	   "dir" => sub { $opt_format = "dir"; },
105
	   "file" => sub { $opt_format = "file"; },
152
	   "file" => sub {$opt_format = "file"; },
106
	   "mbox" => sub { $opt_format = "mbox"; },
153
	   "mbox" => sub { $opt_format = "mbox"; },
107
	   "mbx" => sub { $opt_format = "mbx"; },
154
	   "mbx" => sub { $opt_format = "mbx"; },
108
	   '<>' => \&target) or usage();
155
	   '<>' => \&target) or usage();
109
156
157
if ($opt_hamlog || $opt_spamlog) { # Old style logging
158
  $opt_hamlog ||= "ham.log";
159
  $opt_spamlog ||= "spam.log";
160
}
161
162
my @targets;
163
110
if ($opt_f) {
164
if ($opt_f) {
111
  open(F, $opt_f) || die $!;
165
  open(F, $opt_f) || die $!;
112
  push(@targets, map { chomp; $_ } <F>);
166
  push(@targets, map { chomp; $_ } <F>);
Lines 115-159 Link Here
115
169
116
if (scalar @targets == 0) { usage(); }
170
if (scalar @targets == 0) { usage(); }
117
171
118
#if ($opt_ms) {
172
# Auto-detect --dist option
119
#find_missed($opt_spamlog);
173
if (!defined $opt_dist) {
120
#}
174
  if (-f "$FindBin::Bin/../spamassassin.raw") {
121
#elsif ($opt_mh) {
175
    warn "Automatically using --dist. Assuming you are running from the unpacked tarball. Use --no-dist to override.";
122
#find_missed($opt_hamlog);
176
    $opt_dist = 1;
123
#}
177
  }
178
}
124
179
125
$spamtest = new Mail::SpamAssassin ({
180
my $local_rules_dir;
126
  'debug'              			=> $opt_debug,
127
  'rules_filename'     			=> $opt_c,
128
  'userprefs_filename' 			=> "$opt_p/user_prefs",
129
  'site_rules_filename'			=> "$opt_p/local.cf",
130
  'userstate_dir'     			=> "$opt_p",
131
  'save_pattern_hits'  			=> $opt_loghits,
132
  'dont_copy_prefs'   			=> 1,
133
  'local_tests_only'   			=> $opt_net ? 0 : 1,
134
  'only_these_rules'   			=> $opt_rules,
135
  'ignore_safety_expire_timeout'	=> 1,
136
  PREFIX				=> '',
137
  DEF_RULES_DIR        			=> $opt_c,
138
  LOCAL_RULES_DIR      			=> '',
139
});
140
181
182
if ($opt_dist) { # Set defaults
183
  $opt_c ||= "$FindBin::Bin/../rules";
184
  $opt_p ||= "$FindBin::Bin/mass-check.cf";
185
  $opt_u ||= "$FindBin::Bin/spamassassin";
186
  $opt_s ||= "$FindBin::Bin/spamassassin";
187
  $local_rules_dir = '';
188
}
189
else {
190
  if(!$opt_u) {
191
    # Assuming this is OK, since mass-check isnt supported on windows, is it?
192
    # Also, should there be some check to make sure that previous mass-check stuff isn't in there?
193
    # AFAICT, there isn't otherwise....
194
    if ( -d "${ENV{HOME}}/.spamassassin" ) {
195
      $opt_u = "${ENV{HOME}}/.spamassassin/mass-check";
196
      mkdir $opt_u, 0700 if (! -d $opt_u);
197
    }
198
  }
199
200
# Leave the rest to SA, we'll get it afterwards
201
202
}
203
204
my $spamtest = new Mail::SpamAssassin ({
205
				       'debug'              			=> $opt_debug,
206
				       'rules_filename'     			=> $opt_c,
207
				       'userprefs_filename' 			=> $opt_p,
208
				       'site_rules_filename'			=> $opt_s,
209
				       'userstate_dir'     			=> $opt_u,
210
				       'save_pattern_hits'  			=> $opt_loghits,
211
				       'dont_copy_prefs'   			=> 1,
212
				       'local_tests_only'   			=> $opt_net ? 0 : 1,
213
				       'only_these_rules'   			=> $opt_rules,
214
				       'ignore_safety_expire_timeout'	=> 1,
215
				       DEF_RULES_DIR        			=> $opt_c,
216
				       LOCAL_RULES_DIR      			=> $local_rules_dir,
217
				      });
218
141
$spamtest->compile_now(1);
219
$spamtest->compile_now(1);
142
$spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf");
220
if ($opt_dist) {
221
  $spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf");
222
}
143
223
144
my $who   = `id -un 2>/dev/null`;   chomp $who;
224
my $who   = `id -un 2>/dev/null`;   chomp $who;
145
my $where = `uname -n 2>/dev/null`; chomp $where;
225
my $where = `uname -n 2>/dev/null`; chomp $where;
146
my $when  = `date -u`;              chomp $when;
226
my $when  = `date -u`;              chomp $when;
147
my $revision = "unknown";
227
my $revision;
148
if (open(TESTING, "$opt_c/70_testing.cf")) {
228
149
  chomp($revision = <TESTING>);
229
if ($opt_dist) {
150
  $revision =~ s/.*\$Rev:\s*(\S+).*/$1/;
230
  my $rev = "unknown";
151
  close(TESTING);
231
  if (open(TESTING, "$opt_c/70_testing.cf")) {
232
    chomp($rev = <TESTING>);
233
    $rev =~ s/.*\$Rev:\s*(\S+).*/$1/;
234
    close(TESTING);
235
  }
236
  $revision = "SVN revision: $rev";
152
}
237
}
238
else {
239
  $revision = "Local";
240
}
241
153
my $log_header = "# mass-check results from $who\@$where, on $when\n" .
242
my $log_header = "# mass-check results from $who\@$where, on $when\n" .
154
		 "# M:SA version ".$spamtest->Version()."\n" .
243
		 "# M:SA version ".$spamtest->Version()."\n" .
155
		 "# SVN revision: $revision\n" .
244
		 "# $revision\n" .
156
		 "# Perl version: $] on $Config{archname}\n";
245
		 "# Perl version: $] on $Config{archname}\n";
246
247
if (!$opt_dist) {
248
  my @paths = ( $spamtest->{rules_filename}, $spamtest->{site_rules_filename}, $spamtest->{userprefs_filename} );
249
  $log_header .= "# Using configuration:\n";
250
  foreach my $file (@paths) {
251
    $log_header .=  "# $file\n";
252
  }
253
}
254
157
my $host = $ENV{'HOSTNAME'} || $ENV{'HOST'} || `hostname` || 'localhost';
255
my $host = $ENV{'HOSTNAME'} || $ENV{'HOST'} || `hostname` || 'localhost';
158
chomp $host;
256
chomp $host;
159
257
Lines 222-228 Link Here
222
    autoflush STDOUT 1;
320
    autoflush STDOUT 1;
223
    print STDOUT $log_header;
321
    print STDOUT $log_header;
224
  }
322
  }
225
  else {
323
  elsif ($opt_hamlog || $opt_spamlog) {
226
    open(HAM, "> $opt_hamlog");
324
    open(HAM, "> $opt_hamlog");
227
    open(SPAM, "> $opt_spamlog");
325
    open(SPAM, "> $opt_spamlog");
228
    autoflush HAM 1;
326
    autoflush HAM 1;
Lines 230-235 Link Here
230
    print HAM $log_header;
328
    print HAM $log_header;
231
    print SPAM $log_header;
329
    print SPAM $log_header;
232
  }
330
  }
331
  else {
332
    open(OUT, "> $opt_log");
333
    autoflush OUT 1;
334
    print OUT $log_header;
335
  }
233
  $init_results = 1;
336
  $init_results = 1;
234
}
337
}
235
338
Lines 239-263 Link Here
239
  # don't open results files until we get here to avoid overwriting files
342
  # don't open results files until we get here to avoid overwriting files
240
  &init_results if !$init_results;
343
  &init_results if !$init_results;
241
344
242
  if ($class eq "s") {
345
  if ($opt_o) {
243
    if ($opt_o) { print STDOUT $result; } else { print SPAM $result; }
346
    print STDOUT $result;
244
    $spam_count++;
245
  }
347
  }
246
  elsif ($class eq "h") {
348
  elsif ($opt_spamlog || $opt_hamlog) {
247
    if ($opt_o) { print STDOUT $result; } else { print HAM $result; }
349
    if ($class eq "s") {
248
    $ham_count++;
350
      print SPAM $result;
351
    } else {
352
      print HAM $result;
353
    }
249
  }
354
  }
355
  else {
356
    print OUT $result;
357
  }
250
358
251
  $total_count++;
359
  $total_count++;
252
#warn ">> result: $total_count $class $time\n";
360
#warn ">> result: $total_count $class $time\n";
253
361
254
  if ($opt_progress) {
362
  if ($opt_progress) {
363
    if ($class eq "s") {
364
      $spam_count++;
365
    }
366
    else {
367
      $ham_count++;
368
    }
255
    progress($time);
369
    progress($time);
256
  }
370
  }
257
}
371
}
258
372
259
sub wanted {
373
sub wanted {
260
  my (undef, $id, $time, $dataref) = @_;
374
  my ($class, $id, $time, $dataref) = @_;
261
  my $out;
375
  my $out;
262
376
263
  my $ma = $spamtest->parse($dataref, 1);
377
  my $ma = $spamtest->parse($dataref, 1);
Lines 308-325 Link Here
308
    push(@extra, "mid=$mid");
422
    push(@extra, "mid=$mid");
309
  }
423
  }
310
424
311
  my $yorn;
425
  my $result;
312
  my $score;
426
  my $score;
313
  my $tests;
427
  my $tests;
314
  my $extra;
428
  my $extra;
315
429
316
  if ($opt_loguris) {
430
  if ($opt_loguris) {
317
    $yorn = '.';
431
    $result = '.';
318
    $score = 0;
432
    $score = 0;
319
    $tests = join(" ", sort @uris);
433
    $tests = join(" ", sort @uris);
320
    $extra = '';
434
    $extra = '';
321
  } else {
435
  } else {
322
    $yorn = $status->is_spam() ? 'Y' : '.';
436
    if ($status->is_spam()) {
437
      $result = "s";
438
    } else {
439
      $result = "h";
440
    }
323
    $score = $status->get_score();
441
    $score = $status->get_score();
324
    $tests = join(",", sort(grep(length,$status->get_names_of_tests_hit(),$status->get_names_of_subtests_hit())));
442
    $tests = join(",", sort(grep(length,$status->get_names_of_tests_hit(),$status->get_names_of_subtests_hit())));
325
    $extra = join(",", @extra);
443
    $extra = join(",", @extra);
Lines 333-339 Link Here
333
451
334
  $id =~ s/\s/_/g;
452
  $id =~ s/\s/_/g;
335
453
336
  $out .= sprintf("%s %2d %s %s %s\n", $yorn, $score, $id, $tests, $extra);
454
  $out .= sprintf("%s %s %05.2f %s %s %s\n", $class, $result, $score, $id, $tests, $extra);
337
455
338
  if ($tests =~ /MICROSOFT_EXECUTABLE|MIME_SUSPECT_NAME/) {
456
  if ($tests =~ /MICROSOFT_EXECUTABLE|MIME_SUSPECT_NAME/) {
339
    $out .= logkilled($ma, $id, "possible virus");
457
    $out .= logkilled($ma, $id, "possible virus");
(-)mk-baseline-results (-2 / +2 lines)
Lines 10-16 Link Here
10
echo "Classification success on test corpora, at default threshold:"
10
echo "Classification success on test corpora, at default threshold:"
11
echo
11
echo
12
12
13
./logs-to-c --spam=spam-validate.log --nonspam=nonspam-validate.log --threshold 5 --count --scoreset=$SCORESET | sed -e 's/^Reading.*//' -e '/^$/d'
13
./fp-fn-statistics --logfile=masses-validate.log --threshold 5 --scoreset=$SCORESET | sed -e 's/^Reading.*//' -e '/^$/d'
14
14
15
echo
15
echo
16
echo "Results on test corpora at various alternative thresholds:"
16
echo "Results on test corpora at various alternative thresholds:"
Lines 18-24 Link Here
18
18
19
# list a wide range of thresholds, so that we can make graphs later ;)
19
# list a wide range of thresholds, so that we can make graphs later ;)
20
for thresh in -4 -3 -2 -1 0 1 2 3 4 4.5 5.5 6 6.5 7 8 9 10 12 15 17 20 ; do
20
for thresh in -4 -3 -2 -1 0 1 2 3 4 4.5 5.5 6 6.5 7 8 9 10 12 15 17 20 ; do
21
  ./logs-to-c --spam=spam-validate.log --nonspam=nonspam-validate.log --threshold $thresh --count --scoreset=$SCORESET | sed -e 's/^Reading.*//' -e '/^$/d'
21
  ./logs-to-c --logfile=masses-validate.log --threshold $thresh --scoreset=$SCORESET | sed -e 's/^Reading.*//' -e '/^$/d'
22
  echo
22
  echo
23
done
23
done
24
24
(-)README (-10 / +10 lines)
Lines 52-62 Link Here
52
  This script is used to perform "mass checks" of a set of mailboxes, Cyrus
52
  This script is used to perform "mass checks" of a set of mailboxes, Cyrus
53
  folders, and/or MH mail spools.  It generates summary lines like this:
53
  folders, and/or MH mail spools.  It generates summary lines like this:
54
54
55
  Y  7 /home/jm/Mail/Sapm/1382 SUBJ_ALL_CAPS,SUPERLONG_LINE,SUBJ_FULL_OF_8BITS
55
  s s 07.22 /home/jm/Mail/Sapm/1382 SUBJ_ALL_CAPS,SUPERLONG_LINE,SUBJ_FULL_OF_8BITS
56
56
57
  or for mailboxes,
57
  or for mailboxes,
58
58
59
  .  1 /path/to/mbox:<5.1.0.14.2.20011004073932.05f4fd28@localhost> TRACKER_ID,BALANCE_FOR_LONG
59
  h h 01.32 /path/to/mbox:<5.1.0.14.2.20011004073932.05f4fd28@localhost> TRACKER_ID,BALANCE_FOR_LONG
60
60
61
  listing the path to the message or its message ID, its score, and the tests
61
  listing the path to the message or its message ID, its score, and the tests
62
  that triggered on that mail.
62
  that triggered on that mail.
Lines 65-87 Link Here
65
  get good hits with few false positives, etc., and re-score the tests to
65
  get good hits with few false positives, etc., and re-score the tests to
66
  optimise the ratio.
66
  optimise the ratio.
67
67
68
  This script relies on the spamassassin distribution directory living in "..".
68
  If given the --dist option, this script relies on the spamassassin
69
  distribution directory living in "..". If this script is not in the
70
  distribution directory, it will generate logs based on the site-wide
71
  rules, as well as personal rules.
69
72
70
73
71
logs-to-c :
74
logs-to-c :
72
75
73
  Takes the "spam.log" and "nonspam.log" files and converts them into C
76
  Takes the "masses.log" file and converts them into C source files
74
  source files and simplified data files for use by the C score optimization
77
  and simplified data files for use by the C score optimization
75
  algorithm.  (Called by "make" when you build the perceptron, so generally
78
  algorithm.  (Called by "make" when you build the perceptron, so
76
  you won't need to run it yourself.)
79
  generally you won't need to run it yourself.)
77
80
78
79
hit-frequencies :
81
hit-frequencies :
80
82
81
  Analyses the log files and computes how often each test hits, overall,
83
  Analyses the log files and computes how often each test hits, overall,
82
  for spam mails and for non-spam.
84
  for spam mails and for non-spam.
83
85
84
85
mk-baseline-results :
86
mk-baseline-results :
86
87
87
  Compute results for the baseline scores (read from ../rules/*).  If you
88
  Compute results for the baseline scores (read from ../rules/*).  If you
Lines 91-97 Link Here
91
  It will output statistics on the current ruleset to ../rules/STATISTICS.txt,
92
  It will output statistics on the current ruleset to ../rules/STATISTICS.txt,
92
  suitable for a release build of SpamAssassin.
93
  suitable for a release build of SpamAssassin.
93
94
94
95
perceptron.c :
95
perceptron.c :
96
96
97
  Perceptron learner by Henry Stern.  See "README.perceptron" for details.
97
  Perceptron learner by Henry Stern.  See "README.perceptron" for details.
(-)fp-fn-statistics (-2 / +145 lines)
Lines 1-3 Link Here
1
#!/bin/sh
1
#!/usr/bin/perl -w
2
#
3
# <@LICENSE>
4
# Copyright 2004 Apache Software Foundation
5
#
6
# Licensed under the Apache License, Version 2.0 (the "License");
7
# you may not use this file except in compliance with the License.
8
# You may obtain a copy of the License at
9
#
10
#     http://www.apache.org/licenses/LICENSE-2.0
11
#
12
# Unless required by applicable law or agreed to in writing, software
13
# distributed under the License is distributed on an "AS IS" BASIS,
14
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
# See the License for the specific language governing permissions and
16
# limitations under the License.
17
# </@LICENSE>
18
 
19
 
20
use FindBin;
21
use lib "$FindBin::Bin/../lib";
22
use Mail::SpamAssassin::Masses;
23
use Getopt::Long;
24
use strict;
25
use warnings;
2
26
3
exec ./logs-to-c --count $*
27
use vars qw{$opt_c $opt_l $opt_s $opt_h $opt_t $opt_lambda};
28
29
GetOptions("c|cffile=s@" => \$opt_c, "l|logfile=s" => \$opt_l,
30
"s|scoreset=i" => \$opt_s, "h|help" => \$opt_h, "t|threshold" =>
31
\$opt_t, "lambda" => \$opt_lambda);
32
33
34
# Need to add rule for scores file 
35
sub usage {
36
  die "logs-to-c [-c rules dir] [-l log file] [-s SS] [-t]
37
38
    -c,--cffile   p   use p as rules directory
39
    -s,--scoreset SS  use scoreset SS
40
    -l,--log      log read logs from log instead of from masses.log
41
    -t,--threshold n  use a threshold of n (default: 5)
42
    --lambda=n        use a lambda value of n (default: 5)
43
";
44
}
45
46
47
usage() if $opt_h;
48
49
$opt_c ||= "$FindBin::Bin/../rules";
50
$opt_t = (defined($opt_t) ? $opt_t : 5);
51
$opt_s ||= 0; #|
52
$opt_l ||= "masses.log";
53
$opt_lambda ||= 5;
54
55
print "$opt_c used for config";
56
57
my $nybias = 10;
58
59
60
my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
61
                                               scoreset => $opt_s, # ,,
62
                                               logfile => $opt_l});
63
64
$masses->readlogs();
65
66
my $logs = $masses->get_logs();
67
68
my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore) = (0,0,0,0,0,0,0,0);
69
70
my $num_spam = $masses->get_num_spam();
71
my $num_ham = $masses->get_num_ham();
72
my $num_logs = $num_spam + $num_ham;
73
74
my $count = 0;
75
76
my $score;
77
78
foreach my $log (@$logs) {
79
80
  $score = 0;
81
  foreach my $test (@{$log->{tests_hit}}) {
82
83
    next if ($test->{issubrule});
84
    next if (!$test->{score});
85
86
    $score += $test->{score};
87
88
  }
89
90
  if ($score >= $opt_t) {
91
    if ($log->{isspam}) {
92
      $ga_yy++;
93
      $yyscore += $score;
94
    }
95
    else {
96
      $ga_ny++;
97
      $nyscore += $score;
98
    }
99
  } else {
100
    if ($log->{isspam}) {
101
      $ga_yn++;
102
      $ynscore += $score;
103
    }
104
    else {
105
      $ga_nn++;
106
      $nnscore += $score;
107
    }
108
  }
109
}
110
111
$nybias = $nybias * ($num_spam / $num_ham);
112
113
my $fprate = ($ga_ny / $num_logs) * 100.0;
114
my $fnrate = ($ga_yn / $num_logs) * 100.0;
115
116
printf ("\n# SUMMARY for threshold %3.1f:\n", $opt_t);
117
printf "# Correctly non-spam: %6d  %4.2f%%  (%4.2f%% of non-spam corpus)\n", $ga_nn,
118
  ($ga_nn /  $num_logs) * 100.0, ($ga_nn /  $num_ham) * 100.0;
119
printf "# Correctly spam:     %6d  %4.2f%%  (%4.2f%% of spam corpus)\n" , $ga_yy,
120
  ($ga_yy /  $num_logs) * 100.0, ($ga_yy /  $num_spam) * 100.0;
121
printf "# False positives:    %6d  %4.2f%%  (%4.2f%% of nonspam, %6.0f weighted)\n", $ga_ny,
122
  $fprate, ($ga_ny /  $num_ham) * 100.0, $nyscore*$nybias;
123
printf "# False negatives:    %6d  %4.2f%%  (%4.2f%% of spam, %6.0f weighted)\n", $ga_yn,
124
  $fnrate, ($ga_yn /  $num_spam) * 100.0, $ynscore;
125
126
# convert to the TCR metrics used in the published lit
127
my $nspamspam = $ga_yy;
128
my $nspamlegit = $ga_yn;
129
my $nlegitspam = $ga_ny;
130
my $nlegitlegit = $ga_yn;
131
my $nlegit = $num_ham;
132
my $nspam = $num_spam;
133
134
my $werr = ($opt_lambda * $nlegitspam + $nspamlegit)
135
  / ($opt_lambda * $nlegit + $nspam);
136
137
my $werr_base = $nspam
138
  / ($opt_lambda * $nlegit + $nspam);
139
140
$werr ||= 0.000001;     # avoid / by 0
141
my $tcr = $werr_base / $werr;
142
143
my $sr = ($nspamspam / $nspam) * 100.0;
144
my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;
145
printf "# TCR: %3.6f  SpamRecall: %3.3f%%  SpamPrec: %3.3f%%  FP: %3.2f%%  FN: %3.2f%%\n", $tcr, $sr, $sp, $fprate, $fnrate;
146
(-)extract-message-from-mbox (-1 / +1 lines)
Lines 68-74 Link Here
68
68
69
sub masscheck {
69
sub masscheck {
70
  while (<STDIN>) {
70
  while (<STDIN>) {
71
    my $mail = (split(/\s+/, $_))[2];
71
    my $mail = (split(/\s+/, $_))[3];
72
    $mail =~ tr/_/ /;
72
    $mail =~ tr/_/ /;
73
    if ($mail =~ /^(.*)\.(\d+)$/) {
73
    if ($mail =~ /^(.*)\.(\d+)$/) {
74
      extract($1, $2);
74
      extract($1, $2);
(-)logs-to-c (-346 / +212 lines)
Lines 16-272 Link Here
16
# limitations under the License.
16
# limitations under the License.
17
# </@LICENSE>
17
# </@LICENSE>
18
18
19
use Getopt::Long;
19
=head1 NAME
20
use vars qw($opt_cffile $opt_count $opt_lambda $opt_threshold
21
		$opt_spam $opt_nonspam);
22
20
23
GetOptions("cffile=s", "count", "lambda=f", "threshold=f", "spam=s", "nonspam=s", "scoreset=i");
21
logs-to-c - Convert a mass-check log into perceptron format
24
my $argcffile = $opt_cffile;
25
22
26
my $justcount = 0;
23
=head1 SYNOPSIS
27
if ($opt_count) { $justcount = 1; }
28
24
29
my $threshold = 5;
25
logs-to-c [options]
30
if (defined $opt_threshold) { $threshold = $opt_threshold; }
31
26
32
$opt_spam ||= 'spam.log';
27
 Options:
33
$opt_nonspam ||= 'ham.log';
28
    -c,--cffile=path	  Use path as the rules directory
34
$opt_scoreset = 0 if ( !defined $opt_scoreset );
29
    -s,--scoreset=n	  Use scoreset n
30
    -l,--logfile=file	  Read in file instead of masses.log
31
    -o,--outputdir        Put output in the specified dir (default tmp/)
35
32
36
my $nybias = 10;
33
=head1 DESCRIPTION
37
34
38
# lambda value for TCR equation, indicating the "cost" of recovering
35
B<logs-to-c> will read the mass-check log F<masses.log> or as
39
# from an FP.  The values are: 1 = tagged only, 9 = mailed back to
36
specified by the B<--logfile> option, and convert it into the format
40
# sender asking for token (TMDA style), 999 = deleted outright.
37
needed by the perceptron. This is a format that is simple for the
41
# We (SpamAssassin) use a default of 5, representing "moved to
38
perceptron to parse, but is not very readable to humans.
42
# infrequently-read folder".
43
39
44
my $lambda = 5;
40
By default, output will be put in the directory ./tmp/ unless another
45
if ($opt_lambda) { $lambda = $opt_lambda; }
41
directory is specified by the B<--outputdir> option. (Note: at the
42
current time, this must be /tmp/ in order for the perceptron to
43
compile properly.)
46
44
47
my %is_spam = ();
45
=head1 BUGS
48
my %tests_hit = ();
49
my %mutable_tests = ();
50
46
51
use vars qw(%rules %allrules);
47
Please report bugs to http://bugzilla.spamassassin.org/
52
48
53
readscores();
49
=head1 SEE ALSO
54
50
55
print "Reading per-message hit stat logs and scores...\n";
51
L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>
56
my ($num_tests, $num_spam, $num_nonspam);
57
my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore);
58
52
59
readlogs();
53
=cut
60
read_ranges();
61
54
62
if ($justcount) {
55
use FindBin;
63
  $nybias = $nybias*($num_spam / $num_nonspam);
56
use lib "$FindBin::Bin/../lib";
64
  evaluate();
57
use Mail::SpamAssassin::Masses;
65
} else {
58
use Getopt::Long qw(:config bundling auto_help);
66
  print "Writing logs and current scores as C code...\n";
59
use strict;
67
  writescores_c();
60
use warnings;
61
62
use vars qw{$opt_c $opt_l $opt_s $opt_o};
63
64
GetOptions("c|cffile=s@" => \$opt_c,
65
	   "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
66
	   "l|logfile=s" => \$opt_l,
67
	   "o|output=s" => \$opt_o);
68
69
70
$opt_o ||= "./tmp/";
71
if (!-d $opt_o) {
72
  mkdir $opt_o, 0777 or die "Can't mkdir $opt_o";
68
}
73
}
69
exit 0;
70
74
75
if (!$opt_c || !scalar(@$opt_c)) {
76
    # Try to read this in from the log, if possible
77
    open IN, $opt_l or die "Can't open $opt_l: $!";
78
    my $files = 0; # are we in the files section?
79
    while(<IN>) {
80
	if (!$files) {
81
	    if (/^\# SVN revision:/) {
82
		$opt_c = [ "$FindBin::Bin/../rules" ];
83
		last;
84
	    } elsif (/^\# Using configuration:$/) {
85
		$files = 1;
86
	    }
87
	} elsif (/^\#\s+(.*)\s*$/) {
88
	    push (@$opt_c, $1);
89
	} else {
90
	    # All done!
91
	    last;
92
	}
93
    }
71
94
72
sub readlogs {
95
    foreach my $file (@$opt_c) {
73
  my $count = 0;
96
	die "Can't read $file" unless -r $file;
74
  $num_spam = $num_nonspam = 0;
97
    }
98
}
75
99
76
  if ($justcount) {
100
# ignore rules that are subrules -- we don't generate scores for them...
77
    $ga_yy = $ga_ny = $ga_yn = $ga_nn = 0;
78
    $yyscore = $ynscore = $nyscore = $nnscore = 0.0;
79
  }
80
101
81
  foreach my $file ($opt_spam, $opt_nonspam) {
102
# Note: this will cause a difference over the old logs-to-c since rank
82
    open (IN, "<$file");
103
# is dependent on the frequencies of all rules, not just non-subrules
83
104
84
    while (<IN>) {
105
my $greprules = sub { return 0 if $_[1]->{issubrule}; return 1; };
85
      next if /^\#/;
86
      next if /^$/;
87
      if($_ !~ /^.\s+([-\d]+)\s+\S+\s*/) { warn "bad line: $_"; next; }
88
      my $hits = $1;
89
#my $foo = $_;
90
      $_ = $'; s/(?:bayes|time)=\S+//; s/,,+/,/g; s/^\s+//; s/\s+$//;
91
106
92
      my $score = 0;
107
$opt_s ||= 0; # |
93
      my @tests = ();
94
      foreach my $tst (split (/,/, $_)) {
95
	next if ($tst eq '');
96
	if (!defined $scores{$tst}) {
97
          #warn "unknown test in $file, ignored: $tst\n";
98
	  next;
99
	}
100
108
101
	# Make sure to skip any subrules!
109
my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
102
	next if ( $allrules{$tst}->{issubrule} );
110
					       scoreset => $opt_s, # ,,
111
					       logfile => $opt_l,
112
                                               greprules => $greprules });
103
113
104
        if ($justcount) {
114
$masses->readlogs();
105
          $score += $scores{$tst};
115
$masses->do_score_ranges();
106
        } else {
107
          push (@tests, $tst);
108
        }
109
      }
110
116
111
      if (!$justcount) { 
117
my $rules = $masses->get_rules_array();
112
        $tests_hit{$count} = \@tests;
118
my $logs = $masses->get_logs();
113
      }
114
119
115
      if ($file eq $opt_spam) {
120
my @index_to_rule;
116
	$num_spam++;
121
my $num_spam = $masses->get_num_spam();
117
        if ($justcount) {
122
my $num_ham = $masses->get_num_ham();
118
          if ($score >= $threshold) {
119
            $ga_yy++; $yyscore += $score;
120
          } else {
121
            $ga_yn++; $ynscore += $score;
122
          }
123
        } else {
124
          $is_spam{$count} = 1;
125
        }
126
      } else {
127
	$num_nonspam++;
128
        if ($justcount) {
129
          if ($score >= $threshold) {
130
#print "$score -- $foo";
131
            $ga_ny++; $nyscore += $score;
132
          } else {
133
            $ga_nn++; $nnscore += $score;
134
          }
135
        } else {
136
          $is_spam{$count} = 0;
137
        }
138
      }
139
      $count++;
140
    }
141
    close IN;
142
  }
143
  $num_tests = $count;
144
}
145
123
124
# This is misleading -- num_tests is really num_msgs
125
my $num_tests = $num_spam + $num_ham;
146
126
147
sub readscores {
148
  if (!defined $argcffile) { $argcffile = "../rules"; }
149
  print "Reading scores from \"$argcffile\"...\n";
150
  system ("./parse-rules-for-masses -d \"$argcffile\" -s $opt_scoreset") and die;
151
  require "./tmp/rules.pl";
152
  %allrules = %rules;           # ensure it stays global
153
}
154
127
128
# Write logs and scores as C code
129
writescores_c();
130
writetests_c();
131
132
155
sub writescores_c {
133
sub writescores_c {
156
  my $output = '';
134
157
  my $size = 0;
158
  my $mutable = 0;
135
  my $mutable = 0;
159
  my $i;
136
  my $output = '';
137
  my $count = 0;
138
  my $score = 0;
160
139
161
    # jm: now, score-ranges-from-freqs has tflags to work from, so
140
  foreach my $rule (sort {($b->{ismutable} <=> $a->{ismutable}) ||
162
    # it will always list all mutable tests.
141
			  ($a->{name} cmp $b->{name}) } @$rules) {
163
142
164
  @index_to_rule = sort {($ignored_rule{$a} <=> $ignored_rule{$b}) ||
143
    $score = $rule->{score};
165
			  ($mutable_tests{$b} <=> $mutable_tests{$a}) ||
166
			   ($a cmp $b)} (keys %scores);
167
  my $max_hits_per_msg = 0;
168
  for ($file = 0; $file < $num_tests; $file++) {
169
    my(@hits) =
170
     grep {(! $ignored_rule{$_}) && $mutable_tests{$_}} (@{$tests_hit{$file}});
171
    if ((scalar(@hits)+1) > $max_hits_per_msg) {
172
      $max_hits_per_msg = scalar(@hits)+1;
173
    }
174
  }
175
144
176
  for ($i = 0; $i <= $#index_to_rule; $i++) {
145
    # ignored rules (i.e. no scores)
177
    my $name = $index_to_rule[$i];
146
    next unless $score;
178
    $rule_to_index{$name} = $i;
179
147
180
    if ($ignored_rule{$name}) { next; }
148
    # also ignore rules with score range 0
149
    next if (!$rule->{range_lo} && !$rule->{range_hi});
181
150
182
    if ($mutable_tests{$name} == 0) {
151
    # Set an index
183
      $range_lo{$name} = $range_hi{$name} = $scores{$name};
152
    $rule->{index} = $count;
184
    } else {
153
    $index_to_rule[$count] = $rule; # add the reference to the array
154
155
    if ($rule->{ismutable}) {
185
      $mutable++;
156
      $mutable++;
186
      if ($range_lo{$name} > $range_hi{$name}) {
157
      if ($score > $rule->{range_hi}) {
187
	($range_lo{$name},$range_hi{$name}) =
158
	$score = $rule->{range_hi} - 0.001;
188
	 ($range_hi{$name},$range_lo{$name});
159
      } elsif ($score < $rule->{range_lo}) {
160
	$score = $rule->{range_lo} + 0.001;
189
      }
161
      }
190
      #$range_lo{$name} ||= 0.1;
191
      #$range_hi{$name} ||= 1.5;
192
    }
162
    }
163
    # These should all be set properly if not mutable
164
    # score = range_lo = range_hi
165
    else {
166
      warn "hi != lo for " . $rule->{name} . "!" if $rule->{range_lo} != $rule->{range_hi};
167
      $score = $rule->{range_hi} = $rule->{range_lo};
168
    }
193
169
194
    $output .= ".".$i."\n".
170
    $output .= "." . $count . "\n" .
195
                "n".$name."\n".
171
         "n" . $rule->{name} . "\n" .
196
                "b".$scores{$name}."\n".
172
	 "b" . $score . "\n" .
197
                "m".$mutable_tests{$name}."\n".
173
	 "m" . $rule->{ismutable} . "\n" .
198
                "l".$range_lo{$name}."\n".
174
	 "l" . $rule->{range_lo} . "\n" .
199
                "h".$range_hi{$name}."\n";
175
	 "h" . $rule->{range_hi} . "\n";
200
    $size++;
176
177
    $count++;
178
201
  }
179
  }
202
180
181
  # Output this
203
182
204
  open (DAT, ">tmp/scores.data");
183
  open (DAT, ">$opt_o/scores.data");
205
  print DAT "N$size\n", "M$mutable\n", # informational only
184
  print DAT "N$count\n", "M$mutable\n"; # informational
206
   $output;
185
  print DAT $output;
207
  close DAT;
186
  close DAT;
208
187
209
  open (OUT, ">tmp/scores.h");
188
  open (OUT, ">$opt_o/scores.h");
210
  print OUT "
189
  print OUT <<EOF;
211
#include <stdio.h>
190
#include <stdio.h>
212
#include <string.h>
191
#include <string.h>
213
#include <stdlib.h>
192
#include <stdlib.h>
214
193
 
215
int num_scores = $size;
194
int num_scores = $count;
216
int num_mutable = $mutable;
195
int num_mutable = $mutable;
217
unsigned char is_mutable[$size];
196
unsigned char is_mutable[$count];
218
double range_lo[$size];
197
double range_lo[$count];
219
double range_hi[$size];
198
double range_hi[$count];
220
double bestscores[$size];
199
double bestscores[$count];
221
char *score_names[$size];
200
char *score_names[$count];
222
double tmp_scores[$size][2];
201
double tmp_scores[$count][2];
223
unsigned char ny_hit[$mutable];
202
unsigned char ny_hit[$mutable];
224
unsigned char yn_hit[$mutable];
203
unsigned char yn_hit[$mutable];
225
204
 
226
double lookup[$mutable];
205
double lookup[$mutable];
227
206
 
228
/* readscores() is defined in tests.h */
207
/* readscores() is defined in tests.h */
208
EOF
229
209
230
";
231
  close OUT;
210
  close OUT;
232
211
233
  writetests_c($max_hits_per_msg); # make sure $rule_to_index is around
234
}
212
}
235
213
214
236
sub writetests_c {
215
sub writetests_c {
237
  my $max_hits_per_msg = $_[0];
238
216
239
  my(%uniq_files) = ();
217
  my $max_hits_per_msg = 0;
240
  my(%count_keys) = ();
218
  my @goodtests;
241
  my(%file_key) = ();
219
  my %uniq_logs;
220
  my $uniq_key;
242
221
243
  my $file;
222
  my $i = 0;
244
223
245
  for ($file = 0; $file < $num_tests; $file++)
224
  # This will "compress" the logs so that one log entry can have a
246
  {
225
  # "count" of n indicating it reprents n similar messages
247
    my $uniq_key = $is_spam{$file} . " ";
248
226
249
    my(@good_tests) =
227
  foreach my $log (@$logs) {
250
     grep {length($_) && (! $ignored_rule{$_}) &&
251
	    (defined($rule_to_index{$_}))} (@{ $tests_hit{$file} });
252
228
253
    @good_tests = sort {$a <=> $b} (map {$rule_to_index{$_}} (@good_tests));
229
    (@goodtests) = grep {exists($_->{index})} (@{$log->{tests_hit}});
230
    @goodtests = sort {$a <=> $b} map {$_->{index}} @goodtests;
254
231
255
    $uniq_key .= join(" ",@good_tests);
232
    if($max_hits_per_msg < scalar(@goodtests)) {
233
      $max_hits_per_msg = scalar(@goodtests);
234
    }
256
235
257
    if (exists($count_keys{$uniq_key})) {
236
    $uniq_key = $log->{isspam} ? "s" : "";
258
      $count_keys{$uniq_key}++;
237
    $uniq_key .= join(" ", @goodtests);
238
239
240
    # The %count_keys hash's entries will be the log info for each unique log
241
    # $log->{count} is increased to indicate similar logs
242
243
    if (exists($uniq_logs{$uniq_key})) {
244
      $uniq_logs{$uniq_key}->{count}++;
259
    } else {
245
    } else {
260
      $count_keys{$uniq_key} = 1;
246
      $uniq_logs{$uniq_key} = $log;
261
      $file_key{$file} = $uniq_key;
247
      $uniq_logs{$uniq_key}->{count} = 1;
262
      $uniq_files{$file} = scalar(keys(%count_keys)) - 1;
263
    }
248
    }
249
264
  }
250
  }
265
251
266
  my $num_nondup = scalar(keys(%uniq_files));
252
  my $num_nondup = scalar(keys %uniq_logs);
267
253
268
  open (TOP, ">tmp/tests.h");
254
  open TOP, ">$opt_o/tests.h";
269
  print TOP "
255
  print TOP <<EOF;
270
#include <stdio.h>
256
#include <stdio.h>
271
#include <string.h>
257
#include <string.h>
272
#include <stdlib.h>
258
#include <stdlib.h>
Lines 274-280 Link Here
274
int num_tests = $num_tests;
260
int num_tests = $num_tests;
275
int num_nondup = $num_nondup;
261
int num_nondup = $num_nondup;
276
int num_spam = $num_spam;
262
int num_spam = $num_spam;
277
int num_nonspam = $num_nonspam;
263
int num_nonspam = $num_ham;
278
int max_hits_per_msg = $max_hits_per_msg;
264
int max_hits_per_msg = $max_hits_per_msg;
279
unsigned char num_tests_hit[$num_nondup];
265
unsigned char num_tests_hit[$num_nondup];
280
unsigned char is_spam[$num_nondup];
266
unsigned char is_spam[$num_nondup];
Lines 282-477 Link Here
282
double scores[$num_nondup];
268
double scores[$num_nondup];
283
double tmp_total[$num_nondup];
269
double tmp_total[$num_nondup];
284
int tests_count[$num_nondup];
270
int tests_count[$num_nondup];
271
EOF
285
272
286
";
273
287
  $_ = join ('', <DATA>);
274
  print TOP join('', <DATA>);
288
  print TOP $_;
289
  close TOP;
275
  close TOP;
290
276
291
  open (DAT, ">tmp/tests.data");
292
277
293
  foreach $file (sort {$a <=> $b} (keys %uniq_files)) {
278
  open (DAT, ">$opt_o/tests.data");
294
    print DAT ".".$uniq_files{$file}."\n";
295
279
296
    my $out = '';
280
  my $out;
297
    $out .= "s".$is_spam{$file}."\n";
281
  my $base_score;
282
  my $num_tests_hit;
298
283
299
    my $base_score = 0;
284
  $i = 0;
300
    my $num_tests_hit = 0;
285
  foreach my $log (values %uniq_logs) {
301
    foreach my $test (@{$tests_hit{$file}}) {
286
    $out = '';
302
      if ($test eq '') { next; }
287
    $base_score = $num_tests_hit = 0;
303
288
304
      if ($ignored_rule{$test}) {
289
    print DAT "." . $i . "\n";
305
        warn "ignored rule $test got a hit in $file!\n";
290
306
        next;
291
    $out .= "s" . ( ($log->{isspam})? 1 : 0 ) . "\n";
292
293
    foreach my $test (@{$log->{tests_hit}}) {
294
      if (!$test->{score}) {
295
	# Don't really know why this happens, but the old logs-to-c
296
	#did it too
297
298
	warn "ignored rule " . $test->{name} . " got a hit!";
299
	next;
307
      }
300
      }
308
301
309
      if (!defined $rule_to_index{$test}) {
302
      if (!$test->{range_lo} && !$test->{range_hi}) {
310
	warn "test with no C index: $test\n";
303
	# We ignored this rule
311
	next;
304
	next;
312
      }
305
      }
313
306
314
      if ($mutable_tests{$test}) {
307
      # debugging...
315
      $num_tests_hit++;
308
      if (!defined $test->{index}) {
316
      $out .= "t".$rule_to_index{$test}."\n";
309
	warn "test with no index";
317
318
      if ($num_tests_hit >= $max_hits_per_msg) {
319
	die "Need to increase \$max_hits_per_msg";
320
      }
310
      }
321
      } else {
322
	$base_score += $scores{$test};
323
      }
324
    }
325
311
326
    $out .= "b" . $base_score . "\n"; # score to add in for non-mutable tests
312
      if ($test->{ismutable}) {
327
    $out .= "c" . $count_keys{$file_key{$file}} . "\n";
313
	$num_tests_hit++;
314
	$out .= "t".$test->{index}."\n";
328
315
329
    print DAT "n".$num_tests_hit."\n".$out;
316
	if ($num_tests_hit >= $max_hits_per_msg) {
330
  }
317
	  die "\$max_hits_per_msg not big enough!";
331
  close DAT;
318
	}
332
}
333
319
334
sub read_ranges {
320
      }
335
  if (!-f 'tmp/ranges.data') {
321
      else {
336
    system ("make tmp/ranges.data");
322
	$base_score += $test->{score};
337
  }
323
      }
338
324
339
  # read ranges, and mutableness, from ranges.data.
340
  open (IN, "<tmp/ranges.data")
341
  	or die "need to run score-ranges-from-freqs first!";
342
343
  my $count = 0;
344
  while (<IN>) {
345
    /^(\S+) (\S+) (\d+) (\S+)$/ or next;
346
    my $t = $4;
347
    $range_lo{$t} = $1+0;
348
    $range_hi{$t} = $2+0;
349
    my $mut = $3+0;
350
351
    if ($allrules{$t}->{issubrule}) {
352
      $ignored_rule{$t} = 1;
353
      $mutable_tests{$t} = 0;
354
      next;
355
    }
325
    }
356
    if (($range_lo{$t} == $range_hi{$t}) && (! $range_lo{$t})) {
357
      #warn "ignored rule: score and range == 0: $t\n";
358
      $ignored_rule{$t} = 1;
359
      $mutable_tests{$t} = 0;
360
      next;
361
    }
362
326
363
    $ignored_rule{$t} = 0;
327
    $out .= "b" . $base_score . "\n"; # score to add for non-mutable tests
364
    $index_to_rule[$count] = $t;
328
    $out .= "c" . $log->{count} . "\n"; # number of identical logs
365
    $count++;
366
329
367
    if (!$mut) {
330
    print DAT "n" . $num_tests_hit . "\n" . $out;
368
      $mutable_tests{$t} = 0;
369
    } elsif ($range_lo{$t} == $range_hi{$t}) {
370
      $mutable_tests{$t} = 0;
371
    } elsif ($allrules{$t}->{tflags} =~ m/\buserconf\b/i) {
372
      $mutable_tests{$t} = 0;
373
    } else {
374
      $mutable_tests{$t} = 1;
375
    }
376
    unless ($mutable_tests{$t} || $scores{$t}) {
377
      $ignored_rule{$t} = 1;
378
    }
379
  }
380
  close IN;
381
331
382
  # catch up on the ones missed; seems to be userconf or 0-hitters mostly.
332
    $i++;
383
  foreach my $t (sort keys %allrules) {
384
    next if (exists($range_lo{$t}));
385
    if ($allrules{$t}->{issubrule}) {
386
      $ignored_rule{$t} = 1;
387
      $mutable_tests{$t} = 0;
388
      next;
389
    }
390
    $ignored_rule{$t} = 0;
391
    unless (exists($mutable_tests{$t}) &&
392
	    ($allrules{$t}->{tflags} !~ m/\buserconf\b/i)) {
393
      $mutable_tests{$t} = 0;
394
    }
395
    unless ($mutable_tests{$t} || $scores{$t}) {
396
      $ignored_rule{$t} = 1;
397
    }
398
    $index_to_rule[$count] = $t;
399
    $count++;
400
  }
333
  }
401
  foreach my $t (keys %range_lo) {
402
    next if ($ignored_rule{$t});
403
    if ($mutable_tests{$t}) {
404
      if (($scores{$t} == 1) && ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
405
	$scores{$t} = -1;
406
      } elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/) &&
407
	       ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
408
	$scores{$t} = -0.01;
409
      }
410
      if ($scores{$t} >= $range_hi{$t}) {
411
	$scores{$t} = $range_hi{$t} - 0.001;
412
      } elsif ($scores{$t} <= $range_lo{$t}) {
413
	$scores{$t} = $range_lo{$t} + 0.001;
414
      }
415
    } else {
416
      if ($allrules{$t}->{tflags} =~ m/\buserconf\b/i) {
417
	next;
418
      } elsif ($range_lo{$t} == $range_hi{$t}) {
419
	$scores{$t} = $range_lo{$t};
420
	next;
421
      }
422
      if (($scores{$t} == 1) && ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
423
	$scores{$t} = -1;
424
      } elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/) &&
425
	       ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
426
	$scores{$t} = -0.01;
427
      }
428
      if ($scores{$t} > $range_hi{$t}) {
429
	$scores{$t} = $range_hi{$t};
430
      } elsif ($scores{$t} < $range_lo{$t}) {
431
	$scores{$t} = $range_lo{$t};
432
      }
433
    }
434
  }
435
}
436
334
437
sub evaluate {
335
  close DAT;
438
   my $fprate = ($ga_ny / $num_tests) * 100.0;
439
   my $fnrate = ($ga_yn / $num_tests) * 100.0;
440
336
441
   printf ("\n# SUMMARY for threshold %3.1f:\n", $threshold);
442
   printf "# Correctly non-spam: %6d  %4.2f%%  (%4.2f%% of non-spam corpus)\n", $ga_nn,
443
       ($ga_nn /  $num_tests) * 100.0, ($ga_nn /  $num_nonspam) * 100.0;
444
   printf "# Correctly spam:     %6d  %4.2f%%  (%4.2f%% of spam corpus)\n" , $ga_yy,
445
       ($ga_yy /  $num_tests) * 100.0, ($ga_yy /  $num_spam) * 100.0;
446
   printf "# False positives:    %6d  %4.2f%%  (%4.2f%% of nonspam, %6.0f weighted)\n", $ga_ny,
447
       $fprate, ($ga_ny /  $num_nonspam) * 100.0, $nyscore*$nybias;
448
   printf "# False negatives:    %6d  %4.2f%%  (%4.2f%% of spam, %6.0f weighted)\n", $ga_yn,
449
       $fnrate, ($ga_yn /  $num_spam) * 100.0, $ynscore;
450
337
451
  # convert to the TCR metrics used in the published lit
452
  my $nspamspam = $ga_yy;
453
  my $nspamlegit = $ga_yn;
454
  my $nlegitspam = $ga_ny;
455
  my $nlegitlegit = $ga_yn;
456
  my $nlegit = $num_nonspam;
457
  my $nspam = $num_spam;
458
459
  my $werr = ($lambda * $nlegitspam + $nspamlegit)
460
                  / ($lambda * $nlegit + $nspam);
461
462
  my $werr_base = $nspam
463
                  / ($lambda * $nlegit + $nspam);
464
465
  $werr ||= 0.000001;     # avoid / by 0
466
  my $tcr = $werr_base / $werr;
467
468
  my $sr = ($nspamspam / $nspam) * 100.0;
469
  my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;
470
  printf "# TCR: %3.6f  SpamRecall: %3.3f%%  SpamPrec: %3.3f%%  FP: %3.2f%%  FN: %3.2f%%\n", $tcr, $sr, $sp, $fprate, $fnrate;
471
}
338
}
472
339
473
__DATA__
474
340
341
__DATA__
475
void loadtests (void) {
342
void loadtests (void) {
476
  FILE *fin = fopen ("tmp/tests.data", "r");
343
  FILE *fin = fopen ("tmp/tests.data", "r");
477
  char buf[256];
344
  char buf[256];
Lines 557-560 Link Here
557
424
558
  printf ("Read scores for %d tests.\n", num_scores);
425
  printf ("Read scores for %d tests.\n", num_scores);
559
}
426
}
560
(-)score-ranges-from-freqs (-251 lines)
Lines 1-251 Link Here
1
#!/usr/bin/perl -w
2
#
3
# <@LICENSE>
4
# Copyright 2004 Apache Software Foundation
5
# 
6
# Licensed under the Apache License, Version 2.0 (the "License");
7
# you may not use this file except in compliance with the License.
8
# You may obtain a copy of the License at
9
# 
10
#     http://www.apache.org/licenses/LICENSE-2.0
11
# 
12
# Unless required by applicable law or agreed to in writing, software
13
# distributed under the License is distributed on an "AS IS" BASIS,
14
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
# See the License for the specific language governing permissions and
16
# limitations under the License.
17
# </@LICENSE>
18
19
# (rough) graphic demo of this algorithm:
20
# 0.0  = -limit [......] 0 ........ limit
21
# 0.25 = -limit ..[..... 0 .]...... limit
22
# 0.5  = -limit ....[... 0 ...].... limit
23
# 0.75 = -limit ......[. 0 .....].. limit
24
# 1.0  = -limit ........ 0 [......] limit
25
my $sliding_window_limits = 4.8; # limits = [-$range, +$range]
26
my $sliding_window_size =   5.5; # scores have this range within limits
27
28
# 0.0  = -limit [......] 0 ........ limit
29
# 0.25 = -limit ....[... 0 ]....... limit
30
# 0.5  = -limit ......[. 0 .]...... limit (note: tighter)
31
# 0.75 = -limit .......[ 0 ...].... limit
32
# 1.0  = -limit ........ 0 [......] limit
33
my $shrinking_window_lower_base =   0.00; 
34
my $shrinking_window_lower_range =  1.00; # *ratio, added to above
35
my $shrinking_window_size_base =    1.00;
36
my $shrinking_window_size_range =   1.00; # *ratio, added to above
37
38
my $use_sliding_window = 0;
39
40
my $argcffile = shift @ARGV;
41
my $scoreset = shift @ARGV;
42
$scoreset = 0 if ( !defined $scoreset );
43
44
if (defined ($argcffile) && $argcffile eq '-test') {
45
  # use this to debug the ranking -> score-range mapping:
46
  for $rat (0.0, 0.25, 0.5, 0.75, 1.0) {
47
    my ($lo, $hi); if ($use_sliding_window) {
48
      ($lo, $hi) = sliding_window_ratio_to_range($rat);
49
    } else {
50
      ($lo, $hi) = shrinking_window_ratio_to_range($rat);
51
    }
52
    warn "test: $rat => [ $lo $hi ]\n";
53
  } exit;
54
}
55
56
my %freq_spam = ();
57
my %freq_nonspam = ();
58
59
my $num_spam;
60
my $num_nonspam;
61
my $num_total;
62
63
my %mutable_tests = ();
64
my %ranking = ();
65
my %soratio = ();
66
my %is_nice = ();
67
68
if (!defined $argcffile) { $argcffile = "../rules"; }
69
system ("./parse-rules-for-masses -d \"$argcffile\" -s $scoreset") and die;
70
if (-e "tmp/rules.pl") {
71
  # Note, the spaces need to stay in front of the require to work around a RPM 4.1 problem
72
  require "./tmp/rules.pl";
73
}
74
else {
75
  die "parse-rules-for-masses had no error but no tmp/rules.pl!?!";
76
}
77
78
while (<>) {
79
  /^\s*([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+\S+\s+(.+)\s*$/ or next;
80
81
  my $overall = $1+0;
82
  my $spam = $2+0;
83
  my $nonspam = $3+0;
84
  my $soratio = $4+0;
85
  my $ranking = $5+0;
86
  my $test = $6;
87
88
  if ($test eq '(all messages)') {
89
    $num_spam = $spam;
90
    $num_nonspam = $nonspam;
91
    $num_total = $spam+$nonspam;
92
    next;
93
  }
94
  next if ($test eq '(all messages as %)');
95
96
  if (!defined ($rules{$test})) {
97
    warn "rule $test no longer exists; ignoring\n";
98
    next;
99
  }
100
101
  $freq{$test} = $overall;
102
  $freq_spam{$test} = $spam;
103
  $freq_nonspam{$test} = $nonspam;
104
105
  my $tflags = $rules{$test}->{tflags}; $tflags ||= '';
106
  if ($tflags =~ /\buserconf\b/ ||
107
      ( ($scoreset % 2) == 0 && $tflags =~ /\bnet\b/ )) {
108
    $mutable_tests{$test} = 0;
109
  } else {
110
    $mutable_tests{$test} = 1;
111
  }
112
  if ($tflags =~ m/\bnice\b/i) {
113
    $is_nice{$test} = 1;
114
  } else {
115
    $is_nice{$test} = 0;
116
  }
117
118
  if ($overall < 0.01) {        # less than 0.01% of messages were hit
119
    $mutable_tests{$test} = 0;
120
    $soratio{$test} = 0.5;
121
    $ranking{$test} = 0.0;
122
    $rules{$test}->{score} = 0; # tvd - disable these rules automagically
123
124
  } else {
125
    $soratio{$test} = $soratio;
126
    $ranking{$test} = $ranking;
127
  }
128
}
129
130
if ( ! mkdir "tmp", 0755 ) {
131
  warn "Couldn't create tmp directory!: $!\n";
132
}
133
134
open (OUT, ">tmp/ranges.data");
135
foreach my $test (sort { $ranking{$b} <=> $ranking{$a} } keys %freq) {
136
  if (!defined ($rules{$test})) {
137
    warn "no rule $test";
138
    print OUT ("0 0 0 $test\n");
139
    next;
140
  }
141
142
  my $overall = $freq{$test};
143
  my $spam = $freq_spam{$test};
144
  my $nonspam = $freq_nonspam{$test};
145
  my $soratio = $soratio{$test};
146
  my $ranking = $ranking{$test};
147
  my $mutable = $mutable_tests{$test};
148
149
  if (!$mutable || $rules{$test}->{score} == 0) { # didn't look for score 0 - tvd
150
    printf OUT ("%3.3f %3.3f 0 $test\n",
151
                         $rules{$test}->{score},
152
                         $rules{$test}->{score});
153
    next;
154
  }
155
156
  # 0.0 = best nice, 1.0 = best nonnice
157
  if ($is_nice{$test}) {
158
    $ranking = .5 - ($ranking / 2);
159
  } else {
160
    $ranking = .5 + ($ranking / 2);
161
  }
162
163
  my ($lo, $hi);
164
  if ($use_sliding_window) {
165
    ($lo, $hi) = sliding_window_ratio_to_range($ranking);
166
  } else {
167
    ($lo, $hi) = shrinking_window_ratio_to_range($ranking);
168
  }
169
170
  # tvd
171
  my $tflags = $rules{$test}->{tflags}; $tflags ||= '';
172
  if ( $is_nice{$test} && ( $ranking < .5 ) ) { # proper nice rule
173
    if ( $tflags =~ /\blearn\b/ ) { # learn rules should get a higher score # -5.4
174
      $lo *=1.8;
175
    }
176
    elsif ($soratio <= 0.05 && $nonspam > 0.5) { # let good rules be larger if they want to, -4.5
177
      $lo *= 1.5;
178
    }
179
180
    $hi =	($soratio == 0) ? $lo :
181
    		($soratio <= 0.005 ) ? $lo/1.1 :
182
    		($soratio <= 0.010 && $nonspam > 0.2) ? $lo/2.0 :
183
		($soratio <= 0.025 && $nonspam > 1.5) ? $lo/10.0 :
184
		0;
185
186
    if ( $soratio >= 0.35 ) { # auto-disable bad rules
187
      ($lo,$hi) = (0,0);
188
    }
189
  }
190
  elsif ( !$is_nice{$test} && ( $ranking >= .5 ) ) { # proper spam rule
191
    if ( $tflags =~ /\blearn\b/ ) { # learn rules should get a higher score
192
      $hi *=1.8;
193
    }
194
    elsif ( $soratio >= 0.99 && $spam > 1.0 ) {
195
      $hi *= 1.5; # let good rules be larger if they want to
196
    }
197
198
    $lo =	($soratio == 1) ? $hi:
199
    		($soratio >= 0.995 ) ? $hi/4.0 :
200
    		($soratio >= 0.990 && $spam > 1.0) ? $hi/8.0 :
201
		($soratio >= 0.900 && $spam > 10.0) ? $hi/24.0 :
202
		0;
203
204
    if ( $soratio <= 0.65 ) { # auto-disable bad rules
205
      ($lo,$hi) = (0,0);
206
    }
207
  }
208
  else { # rule that has bad nice setting
209
    ($lo,$hi) = (0,0);
210
  }
211
  $mutable = 0 if ( $hi == $lo );
212
213
  printf OUT ("%3.1f %3.1f $mutable $test\n", $lo, $hi);
214
}
215
close OUT;
216
exit;
217
218
sub sliding_window_ratio_to_range {
219
  my $ratio = shift;
220
  my $lo = -$sliding_window_limits + ($sliding_window_size * $ratio);
221
  my $hi = +$sliding_window_limits - ($sliding_window_size * (1-$ratio));
222
  if ($lo > $hi) { # ???
223
    ($lo,$hi) = ($hi,$lo);
224
  }
225
  ($lo, $hi);
226
}
227
228
sub shrinking_window_ratio_to_range {
229
  my $ratio = shift;
230
  my $is_nice = 0;
231
  my $adjusted = ($ratio -.5) * 2;      # adj [0,1] to [-1,1]
232
  if ($adjusted < 0) { $is_nice = 1; $adjusted = -$adjusted; }
233
234
#$adjusted /= 1.5 if ( $ratio < 0.95 && $ratio > 0.15 ); # tvd
235
236
  my $lower = $shrinking_window_lower_base 
237
                        + ($shrinking_window_lower_range * $adjusted);
238
  my $range = $shrinking_window_size_base 
239
                        + ($shrinking_window_size_range * $adjusted);
240
  my $lo = $lower;
241
  my $hi = $lower + $range;
242
  if ($is_nice) {
243
    my $tmp = $hi; $hi = -$lo; $lo = -$tmp;
244
  }
245
  if ($lo > $hi) { # ???
246
    ($lo,$hi) = ($hi,$lo);
247
  }
248
249
  ($lo, $hi);
250
}
251
(-)find-extremes (-153 / +182 lines)
Lines 17-54 Link Here
17
# limitations under the License.
17
# limitations under the License.
18
# </@LICENSE>
18
# </@LICENSE>
19
19
20
use Getopt::Std;
21
getopts("l:L:h");
22
20
21
use FindBin;
22
use lib "$FindBin::Bin/../lib";
23
use Mail::SpamAssassin::Masses;
24
use Getopt::Long qw(:config bundling auto_help);
25
use Pod::Usage;
26
use strict;
27
use warnings;
28
23
use vars qw {
29
use vars qw {
24
  $opt_h $opt_l $opt_L
30
$opt_c $opt_s $opt_l $opt_L $opt_inclang
25
};
31
};
26
32
27
sub usage {
33
GetOptions("c|cffile=s@" => \$opt_c,
28
  die "find-extremes [-l LC] [-L LC] [spam log] [nonspam log]
34
           "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
35
           "l|logfile=s" => \$opt_l,
36
           "L|language=s" => \$opt_L,
37
           "include-language=s" => \$opt_inclang);
29
38
30
    -l LC  also print language specific rules for lang code LC (or 'all')
31
    -L LC  only print language specific rules for lang code LC (or 'all')
32
39
33
    options -l and -L are mutually exclusive.
34
40
35
    if either the spam or and nonspam logs are unspecified, the defaults
41
my $lower = 1;
36
    are \"spam.log\" and \"nonspam.log\" in the cwd.
42
#$threshold = 5;
43
my $higher = 9;
44
my $min_expected = 2; # Should not be set to more than 5 or less than 2
37
45
38
";
46
47
=head1 NAME
48
 
49
find-extremes - Determine which rules are most likely to cause false positives/negatives.
50
 
51
=head1 SYNOPSIS
52
 
53
hit-frequencies [options]
54
 
55
 Options:
56
    -c,--cffile=path      Use path as the rules directory
57
    -s,--scoreset=n       Use scoreset n
58
    -l,--logfile=file     Read in file instead of masses.log
59
    -L,--language=lc      Only print language specific tests for specified lang code (try 'all')
60
    --include-language=lc Also print language specific tests for specified lang code (try 'all')
61
 
62
=head1 DESCRIPTION
63
64
B<hit-frequencies> will read the mass-check log F<masses.log> or the
65
log given by the B<--logfile> option. By default, B<hit-frequencies>
66
will assume the proper values for B<--cffile> based on the header of
67
the masses.log. The output will include the following columns:
68
69
=over 4
70
71
=item RULE
72
73
=item CHISQUARE
74
75
=item RATIO_FALSEPOS
76
77
=item OVER_FALSEPOS
78
79
=item FREQ_OVER
80
81
=back
82
83
=head1 BUGS
84
85
This script may or may not work as designed - it probably needs some
86
tweaking, and I probably introduced a bug into it while re-writing for
87
the new Masses stuff. 
88
89
=head1 NOTES
90
91
This script is poorly documented. Patches welcome.
92
93
=cut
94
95
96
$opt_s = 0 unless defined $opt_s;
97
98
my $ok_lang = lc ( $opt_inclang || $opt_L || '');
99
$ok_lang = '.' if ($ok_lang eq 'all');
100
101
my $greprules = sub {
102
  my ($name, $rule) = @_;
103
104
  return 0 if (($opt_L && !$rule->{lang}) ||
105
           ($rule->{lang} &&
106
            (!$ok_lang || $rule->{lang} !~ /^$ok_lang/i))); # Wrong language
107
108
  return 0 if ($rule->{tflags} =~ /\bnet\b/);
109
110
  return 1;
111
112
};
113
114
$opt_l ||= "masses.log";
115
116
if (!$opt_c || !scalar(@$opt_c)) {
117
    # Try to read this in from the log, if possible
118
    open (IN, $opt_l) or die "Can't open $opt_l: $!";
119
    my $files = 0; # are we in the files section?
120
    while(<IN>) {
121
        if (!$files) {
122
            if (/^\# SVN revision:/) {
123
                $opt_c = [ "$FindBin::Bin/../rules" ];
124
                last;
125
            } elsif (/^\# Using configuration:$/) {
126
                $files = 1;
127
            }
128
        } elsif (/^\#\s+(.*)\s*$/) {
129
            push (@$opt_c, $1);
130
        } else {
131
            # All done!
132
            last;
133
        }
134
    }
135
136
    foreach my $file (@$opt_c) {
137
        die "Can't read $file" unless -r $file;
138
    }
39
}
139
}
40
140
41
usage() if($opt_h || ($opt_l && $opt_L));
141
my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
142
                                               scoreset => $opt_s,
143
                                               greprules => $greprules,
144
                                               logfile => $opt_l,
145
                                               nologs => 1});
42
146
43
$lower = 1;
147
$masses->readrules();
44
#$threshold = 5;
148
$masses->readlogs();
45
$higher = 9;
46
$min_expected = 2; # Should not be set to more than 5 or less than 2
47
149
48
my %freq_spam = ();	# how often non-nice found in spam
150
my $rules = $masses->get_rules_hash();
151
my $logs = $masses->get_logs();
152
153
my $num_spam = $masses->get_num_spam();
154
my $num_ham = $masses->get_num_ham();
155
49
my %freq_over_higher_falsepos = (); # how often non-nice found in ones over
156
my %freq_over_higher_falsepos = (); # how often non-nice found in ones over
50
                                    # higher threshold that are false positives
157
                                    # higher threshold that are false positives
51
my %freq_nonspam = ();	# how often nice found in nonspam
52
my %freq_under_lower_falseneg = (); # how often nice found in ones under
158
my %freq_under_lower_falseneg = (); # how often nice found in ones under
53
                                    # lower threshold that are false negatives
159
                                    # lower threshold that are false negatives
54
160
Lines 59-101 Link Here
59
my %ratio_expected_falsepos = (); # ratio version of above
165
my %ratio_expected_falsepos = (); # ratio version of above
60
my %ratio_expected_falseneg = (); # ditto
166
my %ratio_expected_falseneg = (); # ditto
61
167
62
my $num_spam = 0;
63
my $num_nonspam = 0;
64
my $num_over_higher_falsepos = 0;
168
my $num_over_higher_falsepos = 0;
65
my $num_under_lower_falseneg = 0;
169
my $num_under_lower_falseneg = 0;
66
my $ok_lang = '';
67
170
68
readscores();
171
my %chisquare = ( );
172
my %prob = ( );
69
173
70
$ok_lang = lc ($opt_l || $opt_L || '');
71
if ($ok_lang eq 'all') { $ok_lang = '.'; }
72
174
73
foreach my $key (keys %rules) {
175
foreach my $key (keys %$rules) {
74
176
75
  if ( ($opt_L && !$rules{$key}->{lang}) ||
177
  if ($rules->{$key}->{tflags} !~ /\buserconf\b/) {
76
       ($rules{$key}->{lang} &&
178
    if ($rules->{$key}->{tflags} =~ m/nice/) {
77
         (!$ok_lang || $rules{$key}->{lang} !~ /^$ok_lang/i)
78
     ) ) {
79
    delete $rules{$key} ; next;
80
  }
81
82
  if ($rules{$key}->{tflags} =~ m/net/) {
83
    delete $rules{$key};
84
    next;
85
  }
86
  if ($rules{$key}->{tflags} !~ m/userconf/) {
87
    if ($rules{$key}->{tflags} =~ m/nice/) {
88
      $freq_nonspam{$key} = 0;
89
      $freq_under_lower_falseneg{$key} = 0;
179
      $freq_under_lower_falseneg{$key} = 0;
90
    } else {
180
    } else {
91
      $freq_spam{$key} = 0;
92
      $freq_over_higher_falsepos{$key} = 0;
181
      $freq_over_higher_falsepos{$key} = 0;
93
    }
182
    }
94
  }
183
  }
184
95
}
185
}
96
186
97
readlogs();
187
foreach my $log (@$logs) {
98
188
189
  if($log->{isspam}) {
190
    # Also need to count plus_hits
191
    my $plus_hits = 0;
192
    foreach my $test (@{$log->{tests_hit}}) {
193
      $plus_hits += $test->{score} if ($test->{score} > 0);
194
    }
195
196
    if(($log->{score} <= $lower) && $plus_hits && $plus_hits >= $lower) {
197
      $num_under_lower_falseneg++;
198
      foreach my $test (@{$log->{tests_hit}}) {
199
	$num_under_lower_falseneg++;
200
	$freq_under_lower_falseneg{$test->{name}}++ if exists $freq_under_lower_falseneg{$test->{name}};
201
      }
202
    }
203
  }
204
  else {
205
    if($log->{score} > $higher) {
206
      $num_over_higher_falsepos++;
207
      foreach my $test (@{$log->{tests_hit}}) {
208
	$num_over_higher_falsepos++;
209
	$freq_over_higher_falsepos{$test->{name}}++ if exists $freq_over_higher_falsepos{$test->{name}};
210
      }
211
    }
212
  }
213
214
}
215
99
unless (($num_over_higher_falsepos >= $min_expected)
216
unless (($num_over_higher_falsepos >= $min_expected)
100
	&& ($num_under_lower_falseneg >= $min_expected)) {
217
	&& ($num_under_lower_falseneg >= $min_expected)) {
101
  die "Insufficient extremes in dataset (" . $num_over_higher_falsepos .
218
  die "Insufficient extremes in dataset (" . $num_over_higher_falsepos .
Lines 119-130 Link Here
119
}
236
}
120
237
121
my $ratio_falsepos = $num_over_higher_falsepos/$num_spam;
238
my $ratio_falsepos = $num_over_higher_falsepos/$num_spam;
122
my $ratio_falseneg = $num_under_lower_falseneg/$num_nonspam;
239
my $ratio_falseneg = $num_under_lower_falseneg/$num_ham;
123
240
124
my $skipped_non_nice = 0;
241
my $skipped_non_nice = 0;
125
242
126
foreach $rule (keys %freq_spam) {
243
# non-nice rules
127
  my $expected = $freq_spam{$rule}*$ratio_falsepos;
244
foreach my $rule (keys %freq_over_higher_falsepos) {
245
  my $expected = $rules->{$rule}->{freq_spam}*$ratio_falsepos;
128
  if ($expected <= $min_expected) {
246
  if ($expected <= $min_expected) {
129
    $skipped_non_nice++;
247
    $skipped_non_nice++;
130
    next;
248
    next;
Lines 136-142 Link Here
136
   $freq_over_higher_falsepos{$rule}/$expected;
254
   $freq_over_higher_falsepos{$rule}/$expected;
137
  ($chisquare{$rule},$prob{$rule}) =
255
  ($chisquare{$rule},$prob{$rule}) =
138
   chisquare($num_spam,$num_over_higher_falsepos,
256
   chisquare($num_spam,$num_over_higher_falsepos,
139
	     $freq_spam{$rule},$freq_over_higher_falsepos{$rule});
257
	     $rules->{$rule}->{freq_spam},$freq_over_higher_falsepos{$rule});
140
  if ($freq_over_higher_falsepos{$rule} < $expected) {
258
  if ($freq_over_higher_falsepos{$rule} < $expected) {
141
    $chisquare{$rule} *= -1;
259
    $chisquare{$rule} *= -1;
142
  }
260
  }
Lines 146-153 Link Here
146
264
147
my $skipped_nice = 0;
265
my $skipped_nice = 0;
148
266
149
foreach $rule (keys %freq_nonspam) {
267
# nice rules
150
  my $expected = $freq_nonspam{$rule}*$ratio_falseneg;
268
foreach my $rule (keys %freq_under_lower_falseneg) {
269
  my $expected = $rules->{$rule}->{freq_ham}*$ratio_falseneg;
151
  if ($expected <= $min_expected) {
270
  if ($expected <= $min_expected) {
152
    $skipped_nice++;
271
    $skipped_nice++;
153
    next;
272
    next;
Lines 158-165 Link Here
158
  $ratio_expected_falseneg{$rule} =
277
  $ratio_expected_falseneg{$rule} =
159
   $freq_under_lower_falseneg{$rule}/$expected;
278
   $freq_under_lower_falseneg{$rule}/$expected;
160
  ($chisquare{$rule},$prob{$rule}) =
279
  ($chisquare{$rule},$prob{$rule}) =
161
   chisquare($num_nonspam,$num_under_lower_falseneg,
280
   chisquare($num_ham,$num_under_lower_falseneg,
162
	     $freq_nonspam{$rule},$freq_under_lower_falseneg{$rule});
281
	     $rules->{$rule}->{freq_ham},$freq_under_lower_falseneg{$rule});
163
  if ($freq_under_lower_falseneg{$rule} < $expected) {
282
  if ($freq_under_lower_falseneg{$rule} < $expected) {
164
    $chisquare{$rule} *= -1;
283
    $chisquare{$rule} *= -1;
165
  }
284
  }
Lines 167-174 Link Here
167
286
168
warn "Skipped nice: $skipped_nice\n";
287
warn "Skipped nice: $skipped_nice\n";
169
288
170
@rules_falsepos = grep {$prob{$_} < .5} (keys %over_expected_falsepos);
289
# The rest is copied verbatim from before - its complicated and not
290
# commented and should work unchanged except for the freq_spam and
291
# freq_ham stuff and fixing some use strict stuff
171
292
293
my @rules_falsepos = grep {$prob{$_} < .5} (keys %over_expected_falsepos);
294
172
if (scalar(@rules_falsepos)) {
295
if (scalar(@rules_falsepos)) {
173
  print "RULE\t\tCHISQUARE\tRATIO_FALSEPOS\tOVER_FALSEPOS\tFREQ_OVER ($num_over_higher_falsepos)\n";
296
  print "RULE\t\tCHISQUARE\tRATIO_FALSEPOS\tOVER_FALSEPOS\tFREQ_OVER ($num_over_higher_falsepos)\n";
174
  my(@rules_falsepos_bad) =
297
  my(@rules_falsepos_bad) =
Lines 183-189 Link Here
183
	   $over_expected_falsepos{$a}) ||
306
	   $over_expected_falsepos{$a}) ||
184
	    ($freq_over_higher_falsepos{$b} <=>
307
	    ($freq_over_higher_falsepos{$b} <=>
185
	     $freq_over_higher_falsepos{$a})} (@rules_falsepos_bad);
308
	     $freq_over_higher_falsepos{$a})} (@rules_falsepos_bad);
186
    foreach $rule (@rules_falsepos_bad) {
309
    foreach my $rule (@rules_falsepos_bad) {
187
      print $rule . "\t" . $prob{$rule} . "\t" .
310
      print $rule . "\t" . $prob{$rule} . "\t" .
188
       $ratio_expected_falsepos{$rule} . "\t" .
311
       $ratio_expected_falsepos{$rule} . "\t" .
189
	$over_expected_falsepos{$rule} . "\t" .
312
	$over_expected_falsepos{$rule} . "\t" .
Lines 199-207 Link Here
199
       ($chisquare{$a} <=> $chisquare{$b}) ||
322
       ($chisquare{$a} <=> $chisquare{$b}) ||
200
	($ratio_expected_falsepos{$a} <=>
323
	($ratio_expected_falsepos{$a} <=>
201
	 $ratio_expected_falsepos{$b}) ||
324
	 $ratio_expected_falsepos{$b}) ||
202
	  ($freq_spam{$b} <=>
325
	  ($rules->{$b}->{freq_spam} <=>
203
	   $freq_spam{$a})} (@rules_falsepos_good);
326
	   $rules->{$a}->{freq_spam})} (@rules_falsepos_good);
204
    foreach $rule (@rules_falsepos_good) {
327
    foreach my $rule (@rules_falsepos_good) {
205
      print $rule . "\t" . $prob{$rule} . "\t" .
328
      print $rule . "\t" . $prob{$rule} . "\t" .
206
       $ratio_expected_falsepos{$rule} . "\t" .
329
       $ratio_expected_falsepos{$rule} . "\t" .
207
	$over_expected_falsepos{$rule} . "\t" .
330
	$over_expected_falsepos{$rule} . "\t" .
Lines 212-218 Link Here
212
  warn "No over-falsepos to print\n";
335
  warn "No over-falsepos to print\n";
213
}
336
}
214
337
215
@rules_falseneg = grep {$prob{$_} < .5} (keys %over_expected_falseneg);
338
my @rules_falseneg = grep {$prob{$_} < .5} (keys %over_expected_falseneg);
216
339
217
if (scalar(@rules_falseneg)) {
340
if (scalar(@rules_falseneg)) {
218
  print "RULE\t\tCHISQUARE\tRATIO_FALSENEG\tOVER_FALSENEG\tFREQ_UNDER ($num_under_lower_falseneg)\n";
341
  print "RULE\t\tCHISQUARE\tRATIO_FALSENEG\tOVER_FALSENEG\tFREQ_UNDER ($num_under_lower_falseneg)\n";
Lines 228-234 Link Here
228
	   $over_expected_falseneg{$a}) ||
351
	   $over_expected_falseneg{$a}) ||
229
	    ($freq_under_lower_falseneg{$b} <=>
352
	    ($freq_under_lower_falseneg{$b} <=>
230
	     $freq_under_lower_falseneg{$a})} (@rules_falseneg_bad);
353
	     $freq_under_lower_falseneg{$a})} (@rules_falseneg_bad);
231
    foreach $rule (@rules_falseneg_bad) {
354
    foreach my $rule (@rules_falseneg_bad) {
232
      print $rule . "\t" . $prob{$rule} . "\t" .
355
      print $rule . "\t" . $prob{$rule} . "\t" .
233
       $ratio_expected_falseneg{$rule} . "\t" .
356
       $ratio_expected_falseneg{$rule} . "\t" .
234
	$over_expected_falseneg{$rule} . "\t" .
357
	$over_expected_falseneg{$rule} . "\t" .
Lines 244-252 Link Here
244
       ($chisquare{$a} <=> $chisquare{$b}) ||
367
       ($chisquare{$a} <=> $chisquare{$b}) ||
245
	($ratio_expected_falseneg{$a} <=>
368
	($ratio_expected_falseneg{$a} <=>
246
	 $ratio_expected_falseneg{$b}) ||
369
	 $ratio_expected_falseneg{$b}) ||
247
	  ($freq_spam{$b} <=>
370
	  ($rules->{$b}->{freq_ham} <=>
248
	   $freq_spam{$a})} (@rules_falseneg_good);
371
	   $rules->{$a}->{freq_ham})} (@rules_falseneg_good);
249
    foreach $rule (@rules_falseneg_good) {
372
    foreach my $rule (@rules_falseneg_good) {
250
      print $rule . "\t" . $prob{$rule} . "\t" .
373
      print $rule . "\t" . $prob{$rule} . "\t" .
251
       $ratio_expected_falseneg{$rule} . "\t" .
374
       $ratio_expected_falseneg{$rule} . "\t" .
252
	$over_expected_falseneg{$rule} . "\t" .
375
	$over_expected_falseneg{$rule} . "\t" .
Lines 258-354 Link Here
258
}
381
}
259
382
260
exit;
383
exit;
261
262
sub readlogs {
263
  my $spam = $ARGV[0] || "spam.log";
264
  my $nonspam = $ARGV[1] || (-f "good.log" ? "good.log" : "nonspam.log");
265
266
267
  (open(NONSPAM,$nonspam)) ||
268
   (die "Couldn't open file '$nonspam': $!; stopped");
269
270
  while (defined($line = <NONSPAM>)) {
271
    if ($line =~ m/^\s*\#/) {
272
      next;
273
    } elsif ($line =~ m/^.\s+-?\d+\s+\S+\s*(\S*)/) {
274
      my $tests = $1;
275
      my $hits = 0;
276
      my(@tests) = ();
277
      foreach $test (grep {length($_)} (split(/,+/,$tests))) {
278
	if (exists($rules{$test})) {
279
	  push @tests, $test;
280
	  $hits += $rules{$test}->{score};
281
	}
282
      }
283
      
284
      if (scalar(@tests)) {
285
	$num_nonspam++;
286
	foreach $test (grep {exists($freq_nonspam{$_})} (@tests)) {
287
	  $freq_nonspam{$test}++;
288
	}
289
	if ($hits >= $higher) {
290
	  $num_over_higher_falsepos++;
291
	  foreach $test (grep
292
			 {exists($freq_over_higher_falsepos{$_})} (@tests)) {
293
	    $freq_over_higher_falsepos{$test}++;
294
	  }
295
	}
296
      }
297
    } elsif ($line =~ m/\S/) {
298
      chomp($line);
299
      warn "Can't interpret line '$line'; skipping";
300
    }
301
  }
302
303
  close(NONSPAM);
304
305
  (open(SPAM,$spam)) || (die "Couldn't open file '$spam': $!; stopped");
306
307
  while (defined($line = <SPAM>)) {
308
    if ($line =~ m/^\s*\#/) {
309
      next;
310
    } elsif ($line =~ m/^.\s+-?\d+\s+\S+\s*(\S*)/) {
311
      my $tests = $1;
312
      my $hits = 0;
313
      my $plus_hits = 0;
314
      my(@tests) = ();
315
      foreach $test (grep {length($_)} (split(/,+/,$tests))) {
316
	if (exists($rules{$test})) {
317
	  push @tests, $test;
318
	  $hits += $rules{$test}->{score};
319
	  if ($rules{$test}->{score} > 0) {
320
	    $plus_hits += $rules{$test}->{score};
321
	  }
322
	}
323
      }
324
      
325
      if (scalar(@tests)) {
326
	$num_spam++;
327
	foreach $test (grep {exists($freq_spam{$_})} (@tests)) {
328
	  $freq_spam{$test}++;
329
	}
330
	if (($hits <= $lower) && $plus_hits &&
331
	    ($plus_hits >= $lower)) {
332
	  $num_under_lower_falseneg++;
333
	  foreach $test (grep
334
			 {exists($freq_under_lower_falseneg{$_})} (@tests)) {
335
	    $freq_under_lower_falseneg{$test}++;
336
	  }
337
	}
338
      }
339
    } elsif ($line =~ m/\S/) {
340
      chomp($line);
341
      warn "Can't interpret line '$line'; skipping";
342
    }
343
  }
344
345
  close(SPAM);
346
}
347
348
349
sub readscores {
350
  system ("./parse-rules-for-masses") and
351
   die "Couldn't do parse-rules-for-masses: $?; stopped";
352
  require "./tmp/rules.pl";
353
}
354

Return to bug 2853