View | Details | Raw Unified | Return to bug 2853
Collapse All | Expand All

(-)lib/Mail/SpamAssassin.pm (-3 / +5 lines)
Lines 1263-1273 Link Here
1263
1263
1264
    # read a file called "init.pre" in site rules dir *before* all others;
1264
    # read a file called "init.pre" in site rules dir *before* all others;
1265
    # even the system config.
1265
    # even the system config.
1266
1267
    # Save this in $self so that it can be accessed externally (for logging, etc.)
1268
    $self->{site_rules_filename} ||= $self->first_existing_path (@site_rules_path);
1266
    my $siterules = $self->{site_rules_filename};
1269
    my $siterules = $self->{site_rules_filename};
1267
    $siterules ||= $self->first_existing_path (@site_rules_path);
1268
1270
1271
    $self->{rules_filename} ||= $self->first_existing_path (@default_rules_path);
1269
    my $sysrules = $self->{rules_filename};
1272
    my $sysrules = $self->{rules_filename};
1270
    $sysrules ||= $self->first_existing_path (@default_rules_path);
1271
1273
1272
    if ($siterules) {
1274
    if ($siterules) {
1273
      $fname = File::Spec->catfile ($siterules, "init.pre");
1275
      $fname = File::Spec->catfile ($siterules, "init.pre");
Lines 1300-1307 Link Here
1300
      $self->get_and_create_userstate_dir();
1302
      $self->get_and_create_userstate_dir();
1301
1303
1302
      # user prefs file
1304
      # user prefs file
1305
      $self->{userprefs_filename} ||= $self->first_existing_path (@default_userprefs_path);
1303
      $fname = $self->{userprefs_filename};
1306
      $fname = $self->{userprefs_filename};
1304
      $fname ||= $self->first_existing_path (@default_userprefs_path);
1305
1307
1306
      if (defined $fname) {
1308
      if (defined $fname) {
1307
        if (!-f $fname && !$self->{dont_copy_prefs} && !$self->create_default_prefs($fname)) {
1309
        if (!-f $fname && !$self->{dont_copy_prefs} && !$self->create_default_prefs($fname)) {
(-)lib/Mail/SpamAssassin/Masses.pm (+788 lines)
Line 0 Link Here
1
# <@LICENSE>
2
# Copyright 2004 Apache Software Foundation
3
#
4
# Licensed under the Apache License, Version 2.0 (the "License");
5
# you may not use this file except in compliance with the License.
6
# You may obtain a copy of the License at
7
#
8
#     http://www.apache.org/licenses/LICENSE-2.0
9
#
10
# Unless required by applicable law or agreed to in writing, software
11
# distributed under the License is distributed on an "AS IS" BASIS,
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
# See the License for the specific language governing permissions and
14
# limitations under the License.
15
# </@LICENSE>
16
17
=head1 NAME
18
19
Mail::SpamAssassin::Masses - Interface for reading and parsing rules
20
and mass-check logs for SpamAssassin
21
22
=head1 SYNOPSIS
23
24
  my $parser = Mail::SpamAssassin::Masses->new();
25
  my $rules = $parser->readrules();
26
  my $logs = $parser->readlogs();
27
28
  foreach my $test (keys %$rules) {
29
    if ($rules->{$test}->{score} > 1) {
30
      ...
31
    }
  }
32
33
=head1 DESCRIPTION
34
35
Mail::SpamAssassin::Masses is a module to simplify the many scripts
36
that used to make up the SpamAssassin re-scoring process. By
37
consolidating all the shared code in one module, the scripts can be
38
simplified and require fewer temporary files.
39
40
=head1 METHODS
41
42
=over 4
43
44
=cut
45
46
package Mail::SpamAssassin::Masses;
47
48
use strict;
49
use warnings;
50
use Carp;
51
52
=item $parser = Mail::SpamAssassin::Masses->new( [ { opt => val, ... } ] );
53
54
Construct a new Mail::SpamAssassin::Masses object. You may pass the
55
following attribute-value pairs to the constructor.
56
57
=over 4
58
59
=item rulesdir
60
61
The directory containing rules. If multiple directories are desired,
62
an anonymous array should be passed.
63
64
=item scoreset
65
66
Scoreset to deal with.
67
68
=item logfile
69
70
Filename of mass-check log.
71
72
=item falses
73
74
Also count frequencies for false positives and false negatives from
75
the logs.
76
77
=item falsesonly
78
79
Only count false positives and false negatives.
80
81
=item greprules
82
83
Coderef that is passed a rule name and a hash ref with the entries
84
containing info about the rule. If the sub returns false, the rule is skipped.
85
86
=item greplogs
87
88
Coderef that is passed a raw log entry. If it returns false, the entry
89
is skipped.
90
91
=item sliding_window
92
93
Use a sliding window for score ranges rather than a shrinking window.
94
95
=item nologs
96
97
Save memory by not saving the individual log results, just the
98
aggregate totals
99
100
=back
101
102
=cut
103
104
sub new {
  # Constructor.  Takes an optional hash reference of options; that same
  # hash is filled with defaults, blessed and returned, so the caller's
  # hash becomes the object.
  my $class = shift;
  $class = ref($class) || $class;   # allow $obj->new() as well as Class->new()

  my $self = shift;
  $self = { } if !defined $self;

  # Defaults: scoreset 0, no rules directory, log file "masses.log".
  $self->{scoreset} ||= 0;
  $self->{rulesdir} ||= '';
  $self->{logfile}  ||= "masses.log";

  # readrules() and readlogs() look the filter callbacks up under the keys
  # "greprules" and "greplogs".  Accept the singular spellings "greprule"
  # and "greplog" too, so a callback passed under either name is honoured.
  # An explicitly supplied plural key always wins.
  $self->{greprules} = $self->{greprule}
    if !$self->{greprules} && $self->{greprule};
  $self->{greplogs} = $self->{greplog}
    if !$self->{greplogs} && $self->{greplog};

  return bless($self, $class);
}
123
124
=item $parser->readrules()
125
126
Read and parse the rules from the directory specified as
127
C<rulesdir>. This loads the following keys and values into the hash
128
entry representing the rules (see below).
129
130
=over 4
131
132
=item name
133
134
Contains the rule's name.
135
136
=item score
137
138
Contains the rule's score.
139
140
=item type
141
142
Contains the rule's type (header, body, uri, etc.)
143
144
=item tflags
145
146
Contains the rule's tflags (nice, autolearn, etc.) as specified in the config file.
147
148
=item lang
149
150
Set to the value of C<lang> for language-specific tests.
151
152
=item issubrule
153
154
Set to true if the rule is a sub-rule (i.e. it starts with
155
__). Otherwise, undefined.
156
157
=item isnice
158
159
This key exists and is true if the rule is nice (i.e. with a score
160
that can be below zero).
161
162
=item describe
163
164
Set to the rule's description, in English, or in the rule's language.
165
166
=back
167
168
There may be more values once C<readlogs()> is run.
169
170
=cut
171
172
173
sub readrules {
  # Parse the rules files named by the "rulesdir" option into
  # $self->{rules}, a hash keyed by rule name whose values are hashes of
  # per-rule information (name, type, lang, tflags, issubrule, isnice,
  # describe, score).
  #
  # Returns the rules hash reference, which is what the SYNOPSIS
  # dereferences.  Croaks if a rules file cannot be opened.
  my $self = shift;

  $self->{rules} ||= { };
  my $rules = $self->{rules}; # same hash as $self->{rules}, not a copy

  # "rulesdir" is either a single path or an array reference of paths.
  my @dirs = ref($self->{rulesdir}) ? @{$self->{rulesdir}} : $self->{rulesdir};

  foreach my $indir (@dirs) {
    # A directory contributes all of its *.cf files (not only the
    # numbered ones); anything else is taken as the path of one file.
    my @files = (-d $indir) ? glob("$indir/*.cf") : ( $indir );

    foreach my $file (@files) {
      open (my $in, '<', $file) or croak("Can't open $file, $!");

      while (my $line = <$in>) {
        # Strip comments and surrounding whitespace, skip blank lines.
        $line =~ s/#.*$//g;
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;
        next if $line =~ /^$/;

        # An optional "lang xx" prefix marks a language-specific line.
        my $lang = '';
        if ($line =~ s/^lang\s+(\S+)\s+//) {
          $lang = lc $1;
        }

        if ($line =~ /^(header|rawbody|body|full|uri|meta)\s+(\S+)\s+/) {
          my ($type, $name) = ($1, $2);

          my $rule = ($rules->{$name} ||= { });
          $rule->{name} = $name;
          $rule->{type} = $type;
          $rule->{lang} = $lang if $lang;

          # Keep tflags recorded by an earlier "tflags" line; clearing
          # them here would leave "isnice" set with empty tflags.
          $rule->{tflags} = '' if !defined $rule->{tflags};

          $rule->{issubrule} = '1' if $name =~ /^__/;

        } elsif ($line =~ /^describe\s+(\S+)\s+(.+)$/) {
          my ($name, $text) = ($1, $2);

          # A language-specific description is used only when the rule
          # itself was declared for that same language.
          if ($lang) {
            my $rule_lang = $rules->{$name} ? $rules->{$name}->{lang} : undef;
            next if !$rule_lang || $rule_lang ne $lang;
          }

          $rules->{$name} ||= { };
          $rules->{$name}->{describe} = $text;

        } elsif ($line =~ /^tflags\s+(\S+)\s+(.+)$/) {
          my ($name, $flags) = ($1, $2);

          $rules->{$name} ||= { };
          $rules->{$name}->{tflags} = $flags;
          $rules->{$name}->{isnice} = 1 if $flags =~ /nice/;

        } elsif ($line =~ /^score\s+(\S+)\s+(.+)$/) {
          my ($name, $score) = ($1, $2);

          $rules->{$name} ||= { };
          if ( $score =~ /\s/ ) { # several scores: pick the one for our scoreset
            ($score) = (split(/\s+/, $score))[$self->{scoreset}];
          }
          $rules->{$name}->{score} = $score;
        }
      }

      close $in;
    }
  }

  foreach my $name (keys %{$rules}) {
    my $rule = $rules->{$name};

    # Entries that only ever got a score/describe/tflags line are not rules.
    if (!defined $rule->{type}) {
      delete $rules->{$name};
      next;
    }

    # Default score: 1.0 (0.01 for T_ rules), negated for nice rules.
    if (!defined $rule->{score}) {
      my $def = ($name =~ /^T_/) ? 0.01 : 1.0;
      $rule->{score} = $rule->{isnice} ? -$def : $def;
    }

    # Optional caller-supplied filter: a false return drops the rule.
    if ($self->{greprules} && !$self->{greprules}->($name, $rule)) {
      delete $rules->{$name};
      next;
    }
  }

  $self->{_readrules} = 1;

  return $rules;
}
273
274
=item $parser->readlogs()
275
276
Read and parse logs from C<logfile>. This will create the anonymous
277
array of hashes referred to by C<$parser->{logs}>, with the following
278
keys:
279
280
=over 4
281
282
=item isspam
283
284
True if the message is spam. False or undefined otherwise.
285
286
=item isfalse
287
288
True if the message was a false negative or positive.
289
290
=item tests_hit
291
292
Array reference containing references to the hash representing each
293
rule hit.
294
295
=item score
296
297
Score the message received (under current scores).
298
299
=back
300
301
In addition, this method adds the following keys to the rule
302
information in C<$parser->{rules}>.
303
304
=over 4
305
306
=item freq_spam
307
308
Frequency hit in spam.
309
310
=item freq_ham
311
312
Frequency hit in ham.
313
314
=item freq_fp
315
316
Frequency in false positives.
317
318
=item freq_fn
319
320
Frequency in false negatives.
321
322
=back
323
324
Also, sets C<$parser->{num_spam}> and C<$parser->{num_ham}> to the number of
325
spam logs read and the number of ham logs read, respectively.
326
327
=cut
328
329
sub readlogs {
  # Read the mass-check log named by the "logfile" option, update the
  # per-rule hit frequencies in $self->{rules} and, unless "nologs" is
  # set, append one hash per message to $self->{logs}.
  #
  # Returns the logs array reference (as used in the SYNOPSIS); in
  # "nologs" mode, where no per-message data is kept, returns 1.
  # Croaks if the log file cannot be opened.
  my $self = shift;

  # Rule scores are needed to score each message, so load the rules first.
  $self->readrules() if !$self->{_readrules};

  my $rules = $self->{rules}; # copy the ref, shorthand
  my $keep_logs = !$self->{nologs};

  my $logs;
  if ($keep_logs) {
    $self->{logs} ||= [ ];
    $logs = $self->{logs};
  }

  my ($num_spam, $num_ham, $count, $num_fp, $num_fn) = (0, 0, 0, 0, 0);

  # Make sure every rule has its counters defined before counting.
  foreach my $rule (values %{$rules}) {
    $rule->{freq_spam} ||= 0;
    $rule->{freq_ham} ||= 0;

    if ($self->{falses}) {
      $rule->{freq_fp} ||= 0;
      $rule->{freq_fn} ||= 0;
    }
  }

  my $file = $self->{logfile};
  open (my $in, '<', $file) or croak("Can't open $file, $!");

  while (my $line = <$in>) {
    next if $line =~ /^\#/;
    next if $line =~ /^$/;

    # Fields: manual classification, result, score, message id, tests hit.
    my ($manual, $result, $hits) =
      $line =~ /^(.)\s+(.)\s+-?[\d.]+\s+\S+(\s+\S+\s+)/;
    if (!defined $manual) {
      warn "bad line: $line";
      next;
    }

    # Optional caller-supplied filter on the raw entry.  $_ is set as well
    # so that callbacks matching against $_ implicitly keep working.
    if ($self->{greplogs}) {
      my $wanted = do { local $_ = $line; $self->{greplogs}->($line) };
      next if !$wanted;
    }

    $hits =~ s/(?:bayes|time)=\S+//;
    $hits =~ s/,,+/,/g;
    $hits =~ s/^\s+//;
    $hits =~ s/\s+$//;

    # A message is "false" when the result disagrees with the manual
    # classification (false positive or false negative).
    my $is_false = ($manual ne $result);
    next if $self->{falsesonly} && !$is_false;

    my $is_spam = ($manual eq "s");
    if ($is_spam) {
      $num_spam++;
      $num_fn++ if $result eq "h";
    } else {
      $num_ham++;
      $num_fp++ if $result eq "s";
    }

    my @tests = ();
    my $score = 0;
    foreach my $tst (split (/,/, $hits)) {
      next if ($tst eq '');

      # Don't count nonexistent rules
      # (Could happen with greprules)
      next if ( !$rules->{$tst} || !$rules->{$tst}->{type} );

      if ($is_spam) {
        $rules->{$tst}->{freq_spam}++;
        $rules->{$tst}->{freq_fn}++ if ($self->{falses} && $result eq "h");
      }
      else {
        $rules->{$tst}->{freq_ham}++;
        $rules->{$tst}->{freq_fp}++ if ($self->{falses} && $result eq "s");
      }

      $score += $rules->{$tst}->{score};

      push (@tests, $rules->{$tst}) if $keep_logs;
    }

    # Per-message data is stored only when logs are being kept; the
    # false flag belongs to the message entry, not to the parser object.
    if ($keep_logs) {
      my %entry = (tests_hit => \@tests, score => $score);
      $entry{isspam}  = 1 if $is_spam;
      $entry{isfalse} = 1 if $is_false;
      $logs->[$count] = \%entry;
    }

    $count++;
  }
  close $in;

  $self->{num_spam} = $num_spam;
  $self->{num_ham} = $num_ham;
  if ($self->{falses}) {
    $self->{num_fn} = $num_fn;
    $self->{num_fp} = $num_fp;
  }

  $self->{_readlogs} = 1; # Done reading logs

  return $keep_logs ? $logs : 1;
}
439
440
=item $parser->do_statistics();
441
442
Calculate the S/O ratio and the rank for each test.
443
444
This adds the following keys to the rules hashes.
445
446
=over 4
447
448
=item spam_percent
449
450
Percentage of spam messages hit.
451
452
=item ham_percent
453
454
Percentage of ham messages hit.
455
456
=item soratio
457
458
S/O ratio -- percentage of spam messages hit divided by total
459
percentage of messages hit.
460
461
=back
462
463
=cut
464
465
sub do_statistics {
  # Derive spam_percent, ham_percent and soratio for every rule from the
  # hit frequencies and message totals gathered by readlogs().
  my $self = shift;

  $self->readlogs() unless $self->{_readlogs};

  for my $stats (values %{$self->{rules}}) {
    my $in_spam = $stats->{freq_spam};
    my $in_ham  = $stats->{freq_ham};

    # A rule that never hit a class gets 0 rather than dividing.
    $stats->{spam_percent} =
      $in_spam ? $in_spam / $self->{num_spam} * 100.0 : 0;
    $stats->{ham_percent} =
      $in_ham ? $in_ham / $self->{num_ham} * 100.0 : 0;

    if ($in_spam || $in_ham) {
      $stats->{soratio} = $stats->{spam_percent} /
        ($stats->{spam_percent} + $stats->{ham_percent});
    }
    else {
      # No hits at all: neutral ratio.
      $stats->{soratio} = 0.5;
    }
  }

  $self->{_statistics} = 1;
}
501
502
=item $parser->do_rank();
503
504
Calculates the ranking for each rule and stores this in the
505
appropriate key.
506
507
=over 4
508
509
=item rank
510
511
"Rank" of the rule. High numbers are good, low are bad.
512
513
=back
514
515
=cut
516
517
sub do_rank {
  # Give every rule a "rank" between 0 and 1.  The raw rank is the sum of
  # two positions: one in the ordering by wanted hits (ascending) and one
  # in the ordering by unwanted hits (descending), the latter scaled so
  # both halves span a comparable range.  The raw ranks are then mapped
  # linearly onto [0,1].
  my $self = shift;

  $self->do_statistics() unless $self->{_statistics};

  my $rules = $self->{rules};

  my (%hits_wanted, %hits_unwanted);   # rule name -> hit count
  my (%seen_wanted, %seen_unwanted);   # distinct hit counts

  for my $info (values %{$rules}) {
    my $name = $info->{name};

    # For nice rules ham hits are the wanted ones, otherwise spam hits.
    my ($good, $bad) = $info->{isnice}
      ? ($info->{freq_ham},  $info->{freq_spam})
      : ($info->{freq_spam}, $info->{freq_ham});

    $hits_wanted{$name}   = $good;
    $hits_unwanted{$name} = $bad;

    $seen_wanted{$good}  = 1;
    $seen_unwanted{$bad} = 1;
  }

  my @by_wanted =
    sort { $hits_wanted{$a} <=> $hits_wanted{$b} } keys %hits_wanted;
  my @by_unwanted =
    sort { $hits_unwanted{$b} <=> $hits_unwanted{$a} } keys %hits_unwanted;

  # First half: position by wanted hits; equal counts share a position.
  my ($step, $prev) = (0, undef);
  for my $name (@by_wanted) {
    my $n = $hits_wanted{$name};
    $step++ if defined $prev && $prev != $n;
    $rules->{$name}->{rank} += $step;
    $prev = $n;
  }

  # The scale factor below divides by the number of distinct unwanted
  # counts, which is zero only when there are no rules at all.
  die "Error: no rules read" unless scalar keys %seen_unwanted;

  my $scale = (scalar keys %seen_wanted) / (scalar keys %seen_unwanted);

  # Second half: scaled position by unwanted hits, tracking the extremes
  # of the combined raw rank along the way.
  my $highest = 0;
  my $lowest  = 9999999;
  ($step, $prev) = (0, undef);
  for my $name (@by_unwanted) {
    my $n = $hits_unwanted{$name};
    $step++ if defined $prev && $prev != $n;
    $rules->{$name}->{rank} += ($step * $scale);
    $prev = $n;

    my $raw = $rules->{$name}->{rank};
    $highest = $raw if $raw > $highest;
    $lowest  = $raw if $raw < $lowest;
  }

  # Map raw ranks onto [0,1]; if all ranks coincide use a fixed 0.001.
  my $spread = $highest - $lowest;
  for my $info (values %{$rules}) {
    if ($spread == 0) {
      $info->{rank} = 0.001;
    }
    else {
      $info->{rank} = ($info->{rank} - $lowest) / $spread;
    }
  }

  $self->{_rank} = 1;
}
582
583
=item $parser->get_rules_array();
584
585
Returns a reference to an array of hash references. The values of
586
these hashes have the keys listed above.
587
588
=cut
589
590
sub get_rules_array {
  # Return a fresh array reference holding every per-rule hash reference
  # from $self->{rules}.
  my ($self) = @_;
  my @rule_infos = values %{$self->{rules}};
  return \@rule_infos;
}
594
595
=item $parser->get_rules_hash();
596
597
Returns a reference to a hash with rule names as keys and hash
598
references as values. These hash references have the keys listed
599
above.
600
601
=cut
602
603
sub get_rules_hash {
  # Return the rules hash reference itself (rule name => info hash), not a copy.
  my ($self) = @_;
  my $by_name = $self->{rules};
  return $by_name;
}
607
608
=item $parser->get_logs();
609
610
Returns a reference to the array containing log entries, in the form
611
of anonymous hashes with keys as described above.
612
613
=cut
614
615
sub get_logs {
  # Return the array reference of per-message log entries stored in
  # $self->{logs}.
  my ($self) = @_;
  my $entries = $self->{logs};
  return $entries;
}
619
620
=item $parser->get_num_ham();
621
622
Returns number of ham logs read.
623
624
=cut
625
626
sub get_num_ham {
  # Return the ham message count stored in $self->{num_ham}.
  my ($self) = @_;
  my $ham_total = $self->{num_ham};
  return $ham_total;
}
630
631
=item $parser->get_num_spam();
632
633
Returns number of spam logs read.
634
635
=cut
636
637
sub get_num_spam {
  # Return the spam message count stored in $self->{num_spam}.
  my ($self) = @_;
  my $spam_total = $self->{num_spam};
  return $spam_total;
}
641
642
=item $parser->do_score_ranges();
643
644
Figure out range in which score can be set based on the soratio, etc.
645
646
This is necessary so that the perceptron doesn't set silly
647
scores. (This may not be as much of a problem as it was with the old
648
GA.)
649
650
This adds the following keys to the rules hashes:
651
652
=over 4
653
654
=item ismutable
655
656
Determines whether the perceptron can select a score for this test.
657
658
=item range_lo
659
660
Determines the lowest score the perceptron can set.
661
662
=item range_hi
663
664
Determines the highest score the perceptron can set.

=back
665
666
=cut
667
668
sub do_score_ranges {
  # Set ismutable, range_lo and range_hi on every rule, based on the
  # rule's rank, S/O ratio, hit percentages, tflags and current score.
  #
  # Declared without a prototype: this is a method taking $self, and an
  # empty prototype would reject the argument if it were ever called as a
  # plain function.
  my $self = shift;

  $self->do_statistics() if !$self->{_statistics};
  $self->do_rank() if !$self->{_rank};

  foreach my $rule (values %{$self->{rules}}) {

    # Get rid of rules that don't hit -- and disable completely.
    if ($rule->{spam_percent} + $rule->{ham_percent} < 0.01 ||
        $rule->{score} == 0) {

      $rule->{ismutable} = 0;
      $rule->{range_lo} = $rule->{range_hi} = 0;
      next;
    }

    # next: get rid of tests that don't apply in this scoreset
    # or are userconf -- set ismutable to 0, but keep the score
    if ($rule->{tflags} =~ /\buserconf\b/ ||
        (($self->{scoreset} % 2) == 0 && $rule->{tflags} =~ /\bnet\b/)) {

      $rule->{ismutable} = 0;
      $rule->{range_lo} = $rule->{range_hi} = $rule->{score};
      next;
    }

    # Normal rules.  The rank (already in [0,1]) is scaled by 4.5 to give
    # the outer bound: negative for nice rules, positive otherwise.  An
    # earlier mapping of the rank around 0.5 was dropped because, with
    # the way ranks are calculated, values above or below 0.5 have no
    # special meaning.
    my ($lo, $hi);

    if ($rule->{isnice}) {
      $hi = 0;
      $lo = $rule->{rank} * -4.5;

      # Modify good rules to be lower
      if ($rule->{tflags} =~ /\blearn\b/) { # learn rules should get
                                            # higher scores (-5.4)
        $lo *= 1.8;
      }
      elsif ($rule->{soratio} <= 0.05 && $rule->{ham_percent} > 0.5) {
        $lo *= 1.5;
      }

      # The better the rule separates ham, the closer the upper bound is
      # pulled towards the lower one (equal bounds fix the score).
      $hi = ($rule->{soratio} == 0)                                   ? $lo :
            ($rule->{soratio} <= 0.005)                               ? $lo/1.1 :
            ($rule->{soratio} <= 0.010 && $rule->{ham_percent} > 0.2) ? $lo/2.0 :
            ($rule->{soratio} <= 0.025 && $rule->{ham_percent} > 1.5) ? $lo/10.0 :
            0;

      # auto-disable nice rules that hit too much spam
      if ($rule->{soratio} >= 0.35) {
        ($lo, $hi) = (0, 0);
      }
    }
    else { # Make non-nice rules have higher scores if they're good
      $hi = $rule->{rank} * 4.5;
      $lo = 0;

      if ($rule->{tflags} =~ /\blearn\b/) {
        $hi *= 1.8;
      }
      elsif ($rule->{soratio} >= 0.99 && $rule->{spam_percent} > 1.0) {
        $hi *= 1.5;
      }

      $lo = ($rule->{soratio} == 1)                                     ? $hi :
            ($rule->{soratio} >= 0.995)                                 ? $hi/4.0 :
            ($rule->{soratio} >= 0.990 && $rule->{spam_percent} > 1.0)  ? $hi/8.0 :
            ($rule->{soratio} >= 0.900 && $rule->{spam_percent} > 10.0) ? $hi/24.0 :
            0;

      if ($rule->{soratio} <= 0.65) { # auto-disable bad rules
        ($lo, $hi) = (0, 0);
      }
    }

    # Some sanity checking
    if ($hi < $lo) {
      ($lo, $hi) = ($hi, $lo);
    }

    # A rule whose bounds coincide has nothing left to tune.
    $rule->{ismutable} = ($lo == $hi) ? 0 : 1;
    $rule->{range_lo} = $lo;
    $rule->{range_hi} = $hi;
  }

  return;
}
785
786
787
# Pacify perl
788
1;
0
  - craig-evolve.scores
789
  - craig-evolve.scores
1
  + craig-evolve.scores
790
  + craig-evolve.scores
(-)masses/parse-rules-for-masses (-148 lines)
Lines 1-148 Link Here
1
#!/usr/bin/perl
2
#
3
# <@LICENSE>
4
# Copyright 2004 Apache Software Foundation
5
# 
6
# Licensed under the Apache License, Version 2.0 (the "License");
7
# you may not use this file except in compliance with the License.
8
# You may obtain a copy of the License at
9
# 
10
#     http://www.apache.org/licenses/LICENSE-2.0
11
# 
12
# Unless required by applicable law or agreed to in writing, software
13
# distributed under the License is distributed on an "AS IS" BASIS,
14
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
# See the License for the specific language governing permissions and
16
# limitations under the License.
17
# </@LICENSE>
18
19
sub usage {
  # Abort with the command-line help text.  The text ends in a newline, so
  # die() prints it without appending a file/line suffix.
  my $help = <<'END_OF_USAGE';

parse-rules-for-masses: parse the SpamAssassin rules files for mass-checks,
        evolving, and frequency analysis

usage: ./parse-rules-for-masses [-d rulesdir] [-o outputfile] [-s scoreset]

rulesdir defaults to ../rules
outputfile defaults to ./tmp/rules.pl
scoreset default to 0

END_OF_USAGE
  die $help;
}
32
33
use Getopt::Long;
34
use Data::Dumper;
35
36
use vars qw(@rulesdirs $outputfile $scoreset);

# Command line: -d may be given several times (one rules directory each),
# -o names the output file, -s selects the scoreset, -h/--help/-? prints
# the usage text and aborts.
GetOptions (
                "d=s"      => \@rulesdirs,
                "o=s"      => \$outputfile,
                "s=i"      => \$scoreset,
                "help|h|?" => sub { usage(); } );

# Defaults for anything not given on the command line.
@rulesdirs = ("../rules") unless @rulesdirs;

unless (defined $outputfile) {
  $outputfile = "./tmp/rules.pl";
  mkdir ("tmp", 0755);
}

$scoreset = 0 unless defined $scoreset;

# readrules() fills $rules; $scores is the name => score view of it that
# writerules() dumps alongside the full rule data.
my $rules = { };
readrules(@rulesdirs);

my $scores = { };
for my $name (keys %{$rules}) {
  $scores->{$name} = $rules->{$name}->{score};
}

writerules($outputfile);
exit;
64
65
sub readrules {
  # Parse the numbered rules files ([0-9]*.cf) of every directory given as
  # an argument into the file-level $rules hash (rule name => info hash
  # with type, lang, issubrule, tflags, describe and score), using
  # $scoreset to choose among multiple scores.  Dies if a rules file
  # cannot be read.
  foreach my $indir (@_) {
    my @files = glob("$indir/[0-9]*.cf");

    foreach my $file (sort @files) {
      open (my $in, '<', $file) or die "cannot read $file: $!";

      while (my $line = <$in>) {
        # Strip comments and surrounding whitespace, skip blank lines.
        $line =~ s/#.*$//g;
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;
        next if $line =~ /^$/;

        # An optional "lang xx" prefix marks a language-specific line.
        my $lang = '';
        if ($line =~ s/^lang\s+(\S+)\s+//) {
          $lang = $1;
        }

        if ($line =~ /^(header|rawbody|body|full|uri|meta)\s+(\S+)\s+/) {
          my ($type, $name) = ($1, $2);

          my $issubrule = ($name =~ /^__/) ? '1' : '0';

          # Key the entry on the rule name (the first capture is the rule
          # type and must not be used as a key).
          $rules->{$name} ||= { };
          $rules->{$name}->{type} = $type;
          $rules->{$name}->{lang} = $lang;
          $rules->{$name}->{issubrule} = $issubrule;
          $rules->{$name}->{tflags} = '';

        } elsif ($line =~ /^describe\s+(\S+)\s+(.+)$/) {
          my ($name, $text) = ($1, $2);
          $rules->{$name} ||= { };
          $rules->{$name}->{describe} = $text;

        } elsif ($line =~ /^tflags\s+(\S+)\s+(.+)$/) {
          my ($name, $flags) = ($1, $2);
          $rules->{$name} ||= { };
          $rules->{$name}->{tflags} = $flags;

        } elsif ($line =~ /^score\s+(\S+)\s+(.+)$/) {
          my ($name, $score) = ($1, $2);
          $rules->{$name} ||= { };
          if ( $score =~ /\s/ ) { # there are multiple scores
            ($score) = (split(/\s+/, $score))[$scoreset];
          }
          $rules->{$name}->{score} = $score;
        }
      }
      close $in;
    }
  }

  foreach my $rule (keys %{$rules}) {
    if (!defined $rules->{$rule}->{type}) {
      delete $rules->{$rule};   # no rule definition -> no rule
      next;
    }

    # Default score: 1.0 (0.01 for T_ rules), negated for nice rules.
    if (!defined $rules->{$rule}->{score}) {
      my $def = ($rule =~ /^T_/) ? 0.01 : 1.0;

      if ($rules->{$rule}->{tflags} =~ /nice/) {
        $rules->{$rule}->{score} = -$def;
      } else {
        $rules->{$rule}->{score} = $def;
      }
    }
  }
}
133
134
sub writerules {
  # Dump the file-level $rules and $scores structures to $outfile as Perl
  # source (via Data::Dumper) that defines %rules and %scores and ends in
  # a true value.  Dies if the file cannot be written.
  my $outfile = shift;

  # Make sure the directory that will hold $outfile exists.  This is done
  # in Perl, not by passing the path to a shell, so spaces or shell
  # metacharacters in the path cannot be misinterpreted.  A failure here
  # is ignored on purpose: open() below reports the real problem.
  require File::Basename;
  require File::Path;
  eval { File::Path::mkpath(File::Basename::dirname($outfile)) };

  open (my $out, '>', $outfile) or die "cannot write to $outfile: $!";
  print $out "# dumped at ".`date`."\n";

  # Purity makes Data::Dumper emit extra statements where needed so the
  # output can be evaluated to rebuild the structures.
  $Data::Dumper::Purity = 1;
  print $out Data::Dumper->Dump ([$rules, $scores], ['*rules', '*scores']);

  print $out "1;";

  # Buffered write errors only show up at close time.
  close $out or die "cannot write to $outfile: $!";
}
148
(-)masses/hit-frequencies (-312 / +183 lines)
Lines 16-400 Link Here
16
# limitations under the License.
16
# limitations under the License.
17
# </@LICENSE>
17
# </@LICENSE>
18
18
19
19
use FindBin;
20
use FindBin;
20
use Getopt::Std;
21
use lib "$FindBin::Bin/../lib";
21
getopts("fm:M:X:l:L:pxhc:at:s:i");
22
use Mail::SpamAssassin::Masses;
23
use Getopt::Long qw(:config bundling auto_help);
24
use Pod::Usage;
25
use strict;
26
use warnings;
22
27
28
23
use vars qw {
29
use vars qw {
24
  $opt_f $opt_m $opt_M $opt_X $opt_p $opt_x $opt_h $opt_l $opt_L $opt_c
30
  $opt_f $opt_m $opt_M $opt_X $opt_p $opt_x $opt_h $opt_l $opt_L $opt_c
25
  $opt_a $opt_t $opt_s $opt_i $sorting
31
  $opt_a $opt_t $opt_s $opt_z $opt_inclang $opt_auto
26
};
32
};
27
33
28
sub usage {
34
GetOptions("c|cffile=s@" => \$opt_c,
29
  die "hit-frequencies [-c rules dir] [-f] [-m RE] [-M RE] [-X RE] [-l LC]
35
	   "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
30
                [-s SC] [-a] [-p] [-x] [-i] [spam log] [ham log]
36
	   "l|logfile=s" => \$opt_l,
37
	   "f|falses" => \$opt_f,
38
	   "a|all" => \$opt_a,
39
	   "p|percentages" => \$opt_p,
40
	   "x|extended" => \$opt_x,
41
	   "m|matchrule=s" => \$opt_m, #,
42
	   "t|tflags=s" => \$opt_t,
43
	   "M|matchlog=s" => \$opt_M,
44
	   "X|excludelog=s" => \$opt_X,
45
	   "L|language=s" => \$opt_L,
46
	   "include-language=s" => \$opt_inclang);
31
47
32
    -c p   use p as the rules directory
33
    -f     falses. count only false-negative or false-positive matches
34
    -m RE  print rules matching regular expression
35
    -t RE  print rules with tflags matching regular expression
36
    -M RE  only consider log entries matching regular expression
37
    -X RE  don't consider log entries matching regular expression
38
    -l LC  also print language specific rules for lang code LC (or 'all')
39
    -L LC  only print language specific rules for lang code LC (or 'all')
40
    -a     display all tests
41
    -p     percentages. implies -x
42
    -x     extended output, with S/O ratio and scores
43
    -s SC  which scoreset to use
44
    -i     use IG (information gain) for ranking
45
48
46
    options -l and -L are mutually exclusive.
49
=head1 NAME
47
50
48
    options -M and -X are *not* mutually exclusive.
51
hit-frequencies - Display statistics about tests hit by a mass-check run
49
52
50
    if either the spam or and ham logs are unspecified, the defaults
53
=head1 SYNOPSIS
51
    are \"spam.log\" and \"ham.log\" in the cwd.
52
54
53
";
55
hit-frequencies [options]
54
}
55
56
56
usage() if($opt_h || ($opt_l && $opt_L));
57
 Options:
58
    -c,--cffile=path	  Use path as the rules directory
59
    -s,--scoreset=n	  Use scoreset n
60
    -l,--logfile=file	  Read in file instead of masses.log
61
    -f			  Count only false-positives/false-negatives
62
    -a			  Report all tests (including subrules)
63
    -p			  Report percentages instead of raw hits
64
    -x			  "Extended" output, include RANK, S/O and SCORE
65
    -m,--matchrule=re     Print rules matching the regular expression
66
    -t,--tflags=re	  Print only rules with tflags matching the regular expression
67
    -M,--matchlog=re      Consider only logs matching the regular expression
68
    -X,--excludelog=re	  Exclude logs matching this regular expression
69
    -L,--language=lc	  Only print language specific tests for specified lang code (try 'all')
70
    --include-language=lc Also print language specific tests for specified lang code (try 'all')
57
71
58
if ($opt_p) {
72
=head1 DESCRIPTION
59
  $opt_x = 1;
60
}
61
73
62
$opt_s = 0 if ( !defined $opt_s );
74
B<hit-frequencies> will read the mass-check log F<masses.log> or the
75
log given by the B<--logfile> option. The output will contain a
76
summary of the number of ham and spam messages and detailed statistics
77
for each rule. By default, B<hit-frequencies> will try to guess the
78
proper values for B<--cffile> based on the header of the
79
masses.log. The output will include the following columns:
63
80
64
my $cffile = $opt_c || "$FindBin::Bin/../rules";
81
=over 4
65
82
66
my %freq_spam = ();
83
=item OVERALL
67
my %freq_ham = ();
68
my $num_spam = 0;
69
my $num_ham = 0;
70
my %ranking = ();
71
my $ok_lang = '';
72
84
73
readscores($cffile);
85
Number of times (or percentage with B<-p>) the rule hit on
86
all messages (spam or ham).
74
87
75
$ok_lang = lc ($opt_l || $opt_L || '');
88
=item SPAM
76
if ($ok_lang eq 'all') { $ok_lang = '.'; }
77
89
78
foreach my $key (keys %rules) {
90
Number of times (or percentage with B<-p>) the rule hit on
91
spam messages.
79
92
80
  if ( ($opt_L && !$rules{$key}->{lang}) ||
93
=item HAM
81
       ($rules{$key}->{lang} &&
82
         (!$ok_lang || $rules{$key}->{lang} !~ /^$ok_lang/i)
83
     ) ) {
84
    delete $rules{$key} ; next;
85
  }
86
94
87
  $freq_spam{$key} = 0;
95
Number of times (or percentage with B<-p>) the rule hit on
88
  $freq_ham{$key} = 0;
96
ham messages.
89
}
90
97
91
readlogs();
98
=item S/O
92
99
93
my $hdr_all = $num_spam + $num_ham;
100
Shown only with B<-x> or B<-p>, this is the number of spam hits
94
my $hdr_spam = $num_spam;
101
divided by total number of hits (C<S/O> refers to spam divided by
95
my $hdr_ham = $num_ham;
102
overall).
96
103
97
if ($opt_p) {
104
=item RANK
98
  my $sorting = $opt_i ? "IG" : "RANK";
99
  if ($opt_f) {
100
    printf "%7s %7s %7s  %6s  %6s  %6s  %s\n",
101
  	"OVERALL%", "FNEG%", "FPOS%", "S/O", $sorting, "SCORE", "NAME";
102
  } else {
103
    printf "%7s %7s  %7s  %6s  %6s  %6s  %s\n",
104
  	"OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME";
105
  }
106
  printf "%7d  %7d  %7d  %7.3f %6.2f  %6.2f  (all messages)\n",
107
  	$hdr_all, $hdr_spam, $hdr_ham,
108
        soratio ($num_spam,$num_ham), 0, 0;
109
105
110
  $hdr_spam = ($num_spam / $hdr_all) * 100.0;
106
Shown only with B<-x> or B<-p>, this is a measure that attempts to
111
  $hdr_ham = ($num_ham / $hdr_all) * 100.0;
107
indicate how I<good> or I<useful> a test is. The higher it is, the
112
  $hdr_all = 100.0;             # this is obvious
108
better the test.
113
  printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  (all messages as %%)\n",
114
  	$hdr_all, $hdr_spam, $hdr_ham,
115
        soratio ($num_spam,$num_ham), 0, 0;
116
109
117
} elsif ($opt_x) {
110
=item SCORE
118
  printf "%7s %7s  %7s  %6s  %6s %6s  %s\n",
119
  	"OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME";
120
  printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  (all messages)\n",
121
  	$hdr_all, $hdr_spam, $hdr_ham,
122
        soratio ($num_spam,$num_ham), 0, 0;
123
111
124
} else {
112
Shown only with B<-x> or B<-p>, this is the current score assigned to
125
  printf "%10s  %10s  %10s  %s\n",
113
the rule.
126
  	"OVERALL", "SPAM", "HAM", "NAME";
127
  printf "%10d  %10d  %10d  (all messages)\n",
128
  	$hdr_all, $hdr_spam, $hdr_ham;
129
}
130
114
131
my %done = ();
115
=item NAME
132
my @tests = ();
133
my $rank_hi = 0;
134
my $rank_lo = 9999999;
135
116
136
# variables for wanted/unwanted RANK
117
This is the rule's name.
137
my %wanted;
138
my %unwanted;
139
my %wranks;
140
my %uranks;
141
118
142
foreach my $test (keys %freq_spam, keys %freq_ham) {
119
=back
143
  next unless (exists $rules{$test});           # only valid tests
144
  next if (!$opt_a && $rules{$test}->{issubrule});
145
120
146
  next if $done{$test}; $done{$test} = 1;
121
=head1 BUGS
147
  push (@tests, $test);
148
122
149
  my $isnice = 0;
123
Please report bugs to http://bugzilla.spamassassin.org/
150
  if ($rules{$test}->{tflags} =~ /nice/) { $isnice = 1; }
151
124
152
  my $fs = $freq_spam{$test}; $fs ||= 0;
125
=head1 SEE ALSO
153
  my $fn = $freq_ham{$test}; $fn ||= 0;
154
  my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0;
155
  my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0;
156
126
157
  my $soratio = $soratio{$test} = soratio ($fsadj, $fnadj);
127
L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>
158
128
159
  if ($isnice) {
129
=cut
160
    $soratio = 1.0 - $soratio;
161
    my $tmp = $fsadj; $fsadj = $fnadj; $fnadj = $tmp;
162
  }
163
130
164
  if ($opt_i) {
131
if ($opt_L && $opt_inclang) {
165
    # come up with a ranking
132
  pod2usage("-L/--language and --include-language are mutually exclusive");
166
    my $rank;
167
168
    # New new system: from "Learning to Filter Unsolicited Commercial E-Mail",
169
    # Ion Androutsopoulos et al: determine the information gain IG(X, C) of the
170
    # Boolean attributes (ie. the rules). Measures "the average reduction in
171
    # the entropy of C (classification) given the value of X (the rule)". Makes
172
    # a good ranking measure with a proper statistical basis. ;)
173
    #
174
    # Still would like to get an entropy measure in, too.
175
    #
176
    #             sum                                    P(X = x ^ C = c)
177
    # IG(X,C) = x in [0, 1]    P(X = x ^ C = c) . log2( ------------------- )
178
    #           c in [Ch, Cs]                           P(X = x) . P(C = c)
179
    #
180
    my $safe_nspam = $num_spam || 0.0000001;
181
    my $safe_nham = $num_ham || 0.0000001;
182
183
    my $num_all = ($num_spam + $num_ham);
184
    my $safe_all = $num_all || 0.0000001;
185
    my $f_all = $fs+$fn;
186
187
    my $px0 = (($num_all - $f_all) / $safe_all);         # P(X = 0)
188
    my $px1 = ($f_all / $safe_all);                      # P(X = 1)
189
    my $pccs = ($num_spam / $safe_all);                  # P(C = Cs)
190
    my $pcch = ($num_ham / $safe_all);                   # P(C = Ch)
191
    my $px1ccs = ($fs / $safe_nspam);                   # P(X = 1 ^ C = Cs)
192
    my $px1cch = ($fn / $safe_nham);                    # P(X = 1 ^ C = Ch)
193
    my $px0ccs = (($num_spam - $fs) / $safe_nspam);     # P(X = 0 ^ C = Cs)
194
    my $px0cch = (($num_ham - $fn) / $safe_nham);       # P(X = 0 ^ C = Ch)
195
    my $safe_px0_dot_pccs = ($px0 * $pccs) || 0.00000001;
196
    my $safe_px0_dot_pcch = ($px0 * $pcch) || 0.00000001;
197
    my $safe_px1_dot_pccs = ($px1 * $pccs) || 0.00000001;
198
    my $safe_px1_dot_pcch = ($px1 * $pcch) || 0.00000001;
199
200
    sub log2 { return log($_[0]) / 0.693147180559945; } # log(2) = 0.6931...
201
202
    my $safe_px0ccs = ($px0ccs || 0.0000001);
203
    my $safe_px0cch = ($px0cch || 0.0000001);
204
    my $safe_px1ccs = ($px1ccs || 0.0000001);
205
    my $safe_px1cch = ($px1cch || 0.0000001);
206
    $rank = ( $px0ccs * log2($safe_px0ccs / $safe_px0_dot_pccs) ) +
207
                    ( $px0cch * log2($safe_px0cch / $safe_px0_dot_pcch) ) +
208
                    ( $px1ccs * log2($safe_px1ccs / $safe_px1_dot_pccs) ) +
209
                    ( $px1cch * log2($safe_px1cch / $safe_px1_dot_pcch) );
210
211
    $ranking{$test} = $rank;
212
    $rank_hi = $rank if ($rank > $rank_hi);
213
    $rank_lo = $rank if ($rank < $rank_lo);
214
  }
215
  else {
216
    # basic wanted/unwanted ranking
217
    $wanted{$test} = $isnice ? $fn : $fs;
218
    $unwanted{$test} = $isnice ? $fs : $fn;
219
    # count number of ranks of each type
220
    $wranks{$wanted{$test}} = 1;
221
    $uranks{$unwanted{$test}} = 1;
222
  }
223
}
133
}
224
134
225
# finish basic wanted/unwanted ranking
135
if ($opt_p) {
226
if (! $opt_i) {
136
  $opt_x = 1;
227
  my @wanted = sort { $wanted{$a} <=> $wanted{$b} } keys %wanted;
228
  my @unwanted = sort { $unwanted{$b} <=> $unwanted{$a} } keys %unwanted;
229
230
  # first half of ranking is the wanted rank
231
  my $position = 0;
232
  my $last = undef;
233
  for my $test (@wanted) {
234
    $position++ if defined $last && $last != $wanted{$test};
235
    $ranking{$test} += $position;
236
    $last = $wanted{$test}
237
  }
238
239
  # second half of ranking is the unwanted rank
240
  my $normalize = (scalar keys %wranks) / (scalar keys %uranks);
241
  $position = 0;
242
  $last = undef;
243
  for my $test (@unwanted) {
244
    $position++ if defined $last && $last != $unwanted{$test};
245
    $ranking{$test} += ($position * $normalize);
246
    $last = $unwanted{$test};
247
    $rank_hi = $ranking{$test} if ($ranking{$test} > $rank_hi);
248
    $rank_lo = $ranking{$test} if ($ranking{$test} < $rank_lo);
249
  }
250
}
137
}
251
138
252
{
139
$opt_s = 0 if ( !defined $opt_s );
253
  # now normalise the rankings to [0, 1]
254
  $rank_hi -= $rank_lo;
255
  foreach $test (@tests) {
256
    $ranking{$test} = $rank_hi == 0 ? 0.001 : ($ranking{$test} - $rank_lo) / ($rank_hi);
257
  }
258
}
259
140
260
foreach $test (sort { $ranking{$b} <=> $ranking{$a} } @tests) {
141
my $ok_lang = lc ( $opt_inclang || $opt_L || '');
261
  next unless (exists $rules{$test});           # only valid tests
142
$ok_lang = '.' if ($ok_lang eq 'all');
262
  next if (!$opt_a && $rules{$test}->{issubrule});
263
143
264
  my $fs = $freq_spam{$test}; $fs ||= 0;
144
my $greprules = sub { # To determine whether rule should be read
265
  my $fn = $freq_ham{$test}; $fn ||= 0;
145
  my ($name, $rule) = @_;
266
  my $fa = $fs+$fn;
267
146
268
  next if ($opt_m && $test !~ m/$opt_m/);	# match certain tests
147
  return 0 if ($opt_m && $name !~ /$opt_m/); # name doesn't match -m
269
  next if ($opt_t && $rules{$test}->{tflags} !~ /$opt_t/); # match tflags
148
                                             # expression
149
  return 0 if ($opt_t && $rule->{tflags} !~ /$opt_t/); # tflags don't
150
                                                       # match -t
151
                                                       # expression
152
  return 0 if (($opt_L && !$rule->{lang}) ||
153
	   ($rule->{lang} &&
154
	    (!$ok_lang || $rule->{lang} !~ /^$ok_lang/i))); # Wrong language
270
155
156
  return 0 if ($rule->{issubrule} && !$opt_a);
157
271
  if (!$opt_a && !$opt_t) {
158
  if (!$opt_a && !$opt_t) {
272
    next if ($rules{$test}->{tflags} =~ /net/ && ($opt_s % 2 == 0));   # not net tests
159
    return 0 if ($rule->{tflags} =~ /net/ && ($opt_s % 2 == 0));
273
    next if ($rules{$test}->{tflags} =~ /userconf/); # or userconf
160
    return 0 if ($rule->{tflags} =~ /userconf/); # or userconf
274
  }
161
  }
162
  return 1;
275
163
276
  # adjust based on corpora sizes (and cvt to % while we're at it)
164
};
277
  my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0;
278
  my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0;
279
165
280
  if ($opt_f && $fsadj == 0 && $fnadj == 0) { next; }
281
166
282
  if ($opt_p) {
167
my $logfile = $opt_l || "masses.log";
283
    $fa = ($fa / ($num_spam + $num_ham)) * 100.0;
284
    $fs = $fsadj;
285
    $fn = $fnadj;
286
  }
287
168
288
  my $soratio = $soratio{$test};
169
if (!$opt_c || !scalar(@$opt_c)) {
289
  if (!defined $soratio) {
170
    # Try to read this in from the log, if possible
290
    $soratio{$test} = soratio ($fsadj, $fnadj);
171
    open IN, $logfile or die "Can't open $logfile: $!";
291
  }
172
    my $files = 0; # are we in the files section?
173
    while(<IN>) {
174
	if (!$files) {
175
	    if (/^\# SVN revision:/) {
176
		$opt_c = [ "$FindBin::Bin/../rules" ];
177
		last;
178
	    } elsif (/^\# Using configuration:$/) {
179
		$files = 1;
180
	    }
181
	} elsif (/^\#\s+(.*)\s*$/) {
182
	    push (@$opt_c, $1);
183
	} else {
184
	    # All done!
185
	    last;
186
	}
187
    }
292
188
293
  if ($opt_p) {
189
    if (!defined $opt_c) {
294
    printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  %s\n",
190
      $opt_c = [ "$FindBin::Bin/../rules" ];
295
  	$fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}, $test;
191
    }
296
192
297
  } elsif ($opt_x) {
193
    foreach my $file (@$opt_c) {
298
    printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  %s\n",
194
	die "Can't read $file" unless -r $file;
299
  	$fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}, $test;
195
    }
300
301
  } else {
302
    printf "%10d  %10d  %10d  %s\n", $fa, $fs, $fn, $test;
303
  }
304
}
196
}
305
exit;
197
	    
198
my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
199
					       scoreset => $opt_s,
200
                                               falsesonly => $opt_f,
201
                                               greprules => $greprules,
202
                                               logfile => $logfile,
203
                                               nologs => 1});
306
204
205
$masses->readrules();
206
$masses->readlogs();
207
$masses->do_statistics();
208
$masses->do_rank();
307
209
210
my $rules = $masses->get_rules_hash();
211
my $num_ham = $masses->get_num_ham();
212
my $num_spam = $masses->get_num_spam();
213
my $num_all = $num_ham + $num_spam;
308
214
309
sub readlogs {
215
if ($num_ham + $num_spam <= 0) {
310
  my $spam = $ARGV[0] || "spam.log";
216
  die "Can't run hit-frequencies on 0 messages.";
311
  my $ham = $ARGV[1] || (-f "good.log" ? "good.log" : "ham.log");
217
}
312
218
313
  foreach my $file ($spam, $ham) {
219
## Write header
314
    open (IN, "<$file") || die "Could not open file '$file': $!";
315
220
316
    my $isspam = 0; ($file eq $spam) and $isspam = 1;
221
if ($opt_p) {
317
222
318
    while (<IN>) {
223
  if ($opt_f) {
319
      next if (/^#/);
224
    printf "%7s %7s %7s  %6s  %6s  %6s  %s\n",
320
      next unless (!$opt_M || /$opt_M/o);
225
  	"OVERALL%", "FNEG%", "FPOS%", "S/O", "RANK", "SCORE", "NAME";
321
      next if ($opt_X && /$opt_X/o);
226
  } else {
227
    printf "%7s %7s  %7s  %6s  %6s  %6s  %s\n",
228
  	"OVERALL%", "SPAM%", "HAM%", "S/O", "RANK", "SCORE", "NAME";
229
  }
322
230
323
      /^(.)\s+(-?\d+)\s+(\S+)\s*(\S*)/ or next;
231
  printf "%7d  %7d  %7d  %7.3f %6.2f  %6.2f  (all messages)\n",
324
      my $caught = ($1 eq 'Y');
232
  	$num_all, $num_spam, $num_ham,
325
      my $hits = $2;
233
        $num_spam / $num_all, 0, 0;
326
      $_ = $4; s/,,+/,/g;
327
234
328
      if ($isspam) {
235
  printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  (all messages as %%)\n",
329
        if ($opt_f) {
236
  	100.0, $num_spam / $num_all * 100.0, $num_ham / $num_all * 100.0,
330
          if (!$caught) { $num_spam++; }
237
        $num_spam / $num_all, 0, 0;
331
        } else {
332
          $num_spam++;
333
        }
334
      } else {
335
        if ($opt_f) {
336
          if ($caught) { $num_ham++; }
337
        } else {
338
          $num_ham++;
339
        }
340
      }
341
238
342
      my @tests = split (/,/, $_);
239
} elsif ($opt_x) {
343
      foreach my $t (@tests) {
240
  printf "%7s %7s  %7s  %6s  %6s %6s  %s\n",
344
	next if ($t eq '');
241
  	"OVERALL", "SPAM", "HAM", "S/O", "RANK", "SCORE", "NAME";
345
	if ($isspam) {
242
  printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  (all messages)\n",
346
          if ($opt_f) {
243
  	$num_all, $num_spam, $num_ham,
347
            if (!$caught) { $freq_spam{$t}++; }
244
        $num_spam / $num_all, 0, 0;
348
          } else {
349
            $freq_spam{$t}++;
350
          }
351
	} else {
352
          if ($opt_f) {
353
            if ($caught) { $freq_ham{$t}++; }
354
          } else {
355
            $freq_ham{$t}++;
356
          }
357
	}
358
      }
359
    }
360
    close IN;
361
  }
362
}
363
245
364
246
} else {
365
sub readscores {
247
  printf "%10s  %10s  %10s  %s\n",
366
  my($cffile) = @_;
248
  	"OVERALL", "SPAM", "HAM", "NAME";
367
  system ("$FindBin::Bin/parse-rules-for-masses -d \"$cffile\" -s $opt_s") and die;
249
  printf "%10d  %10d  %10d  (all messages)\n",
368
  require "./tmp/rules.pl";
250
  	$num_all, $num_spam, $num_ham;
369
}
251
}
370
252
371
sub soratio {
253
foreach my $test (sort { $rules->{$b}->{rank} <=> $rules->{$a}->{rank} } keys %{$rules}) {
372
  my ($s, $n) = @_;
373
254
374
  $s ||= 0;
255
  if ($opt_p) {
375
  $n ||= 0;
256
    printf "%7.3f  %7.4f  %7.4f  %7.3f %6.2f  %6.2f  %s\n",
376
257
  	($rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham}) / $num_all * 100.0,
377
  if ($s + $n > 0) {
258
        $rules->{$test}->{spam_percent}, $rules->{$test}->{ham_percent},
378
      return $s / ($s + $n);
259
        $rules->{$test}->{soratio}, $rules->{$test}->{rank}, $rules->{$test}->{score}, $test;
260
  } elsif ($opt_x) {
261
    printf "%7d  %7d  %7d  %7.3f %6.2f %6.2f  %s\n",
262
  	$rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham},
263
        $rules->{$test}->{freq_spam}, $rules->{$test}->{freq_ham},
264
        $rules->{$test}->{soratio}, $rules->{$test}->{rank}, $rules->{$test}->{score}, $test;
379
  } else {
265
  } else {
380
      return 0.5;		# no results -> not effective
266
    printf "%10d  %10d  %10d  %s\n",
267
        $rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham},
268
        $rules->{$test}->{freq_spam}, $rules->{$test}->{freq_ham}, $test;
381
  }
269
  }
382
}
270
}
383
271
384
# Compute a cost ratio from four message counts ("tcr" is presumably
# "total cost ratio" -- confirm against the callers).
#
# Parameters (meanings inferred from the names -- TODO confirm):
#   $nspam      - number of spam messages
#   $nlegit     - number of legitimate messages
#   $nspamspam  - spam messages counted as spam
#   $nlegitspam - legitimate messages counted as spam
#
# Returns $werr_base / $werr, where both error rates share the
# denominator ($lambda * $nlegit + $nspam) and $lambda is fixed at 99.
#
# NOTE(review): when $nlegit and $nspam are both 0 the shared denominator
# is 0 and Perl dies with "Illegal division by zero"; that behaviour is
# unchanged here.
sub tcr {
  my ($nspam, $nlegit, $nspamspam, $nlegitspam) = @_;

  my $lambda = 99;

  my $nspamlegit = $nspam - $nspamspam;

  # Both error rates are normalised by the same weighted total.
  my $weighted_total = $lambda * $nlegit + $nspam;

  my $werr      = ($lambda * $nlegitspam + $nspamlegit) / $weighted_total;
  my $werr_base = $nspam / $weighted_total;

  $werr ||= 0.000001;     # avoid / by 0 when there are no errors at all
  return $werr_base / $werr;
}
(-)masses/perceptron.pod (+30 lines)
Line 0 Link Here
1
=head1 NAME
2
3
perceptron - Generate scores for SpamAssassin using the "Stochastic
4
Gradient Method"
5
6
=head1 SYNOPSIS
7
8
perceptron [options]
9
10
 Options:
11
  -p ham_preference 	Modifies tendency to prefer false negatives over
12
			false positives (default 2.0) (higher = less fp)
13
  -e num_epochs		Set number of passes to make (default 15)
14
  -l learning_rate	Modifies learning rate (default 2.0)
15
  -w weight_decay 	Scores multiplied by this value after each pass
16
			to prevent scores from getting too high
17
			(default off (1.0))
18
19
=head1 DESCRIPTION
20
21
This algorithm is used to optimize SpamAssassin scores, based on the
22
input given by B<logs-to-c>. At the time of writing, the output of
23
logs-to-c needs to be compiled into the source before perceptron can
24
be used, but this will be fixed soon, I hope.
25
26
=head1 SEE ALSO
27
28
L<logs-to-c(1)>
29
30
=cut
(-)masses/rewrite-cf-with-new-scores (-21 / +114 lines)
Lines 16-47 Link Here
16
# limitations under the License.
16
# limitations under the License.
17
# </@LICENSE>
17
# </@LICENSE>
18
18
19
=head1 NAME
20
21
rewrite-cf-with-new-scores - Rewrite SpamAssassin scores file with new
22
scores.
23
24
=head1 SYNOPSIS
25
26
rewrite-cf-with-new-scores [options]
27
28
  Options
29
  --old-scores=file    Read file containing the old SpamAssassin scores
30
  --new-scores=file    Read file containing the new SpamAssassin scores
31
  -s,--scoreset n      Rewrite scoreset n
32
  --output=file        Output rewritten score file to file
33
  -c,--cffile=path     Use path as the rules directory
34
  -l,--logfile=file    Use file instead of masses.log (for guessing -c)
35
36
 Note: these options can be shortened (i.e. --old, --new, --out) as
37
 long as they are unambiguous.
38
39
=head1 DESCRIPTION
40
41
B<rewrite-cf-with-new-scores> is a tool to update the sitewide scores
42
file with the newly generated scores. Since SpamAssassin has four
43
different scoresets, which each need to be generated separately, this
44
tool is used to only change the correct scoreset.
45
46
By default, the old scores are read from 50_scores.cf in the rules
47
directory and the new ones from ./perceptron.scores. The output will
48
be ./50_scores.cf by default.
49
50
The rules directory needs to be used to make sure scores are given for
51
the right tests. Rules not found in the rules directory will not be
52
given scores in the output.
53
54
=head1 BUGS
55
56
Please report bugs to http://bugzilla.spamassassin.org/
57
58
=head1 SEE ALSO
59
60
L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>
61
62
=cut
63
64
use FindBin;
65
use lib "$FindBin::Bin/../lib";
66
use Getopt::Long qw(:config bundling auto_help);
67
use Mail::SpamAssassin::Masses;
68
use Pod::Usage;
69
use strict;
70
use warnings;
71
72
use vars qw($opt_old $opt_new $opt_scoreset $opt_out $opt_c $opt_l);
73
74
GetOptions("old-scores=s" => \$opt_old,
75
	   "new-scores=s" => \$opt_new,
76
	   "s|scoreset=i" => \$opt_scoreset,
77
	   "output=s" => \$opt_out,
78
	   "c|cffile=s@" => \$opt_c,
79
	   "l|logfile=s" => \$opt_l);
80
81
$opt_l ||= "masses.log";
82
$opt_scoreset = 0 unless defined $opt_scoreset;
83
19
my $NUM_SCORESETS = 4;
84
my $NUM_SCORESETS = 4;
20
85
21
my ($scoreset,$oldscores,$newscores) = @ARGV;
86
if (!$opt_c || !scalar(@$opt_c)) {
87
    # Try to read this in from the log, if possible
88
    open IN, $opt_l or die "Can't open $opt_l: $!";
89
    my $files = 0; # are we in the files section?
90
    while(<IN>) {
91
	if (!$files) {
92
	    if (/^\# SVN revision:/) {
93
		$opt_c = [ "$FindBin::Bin/../rules" ];
94
		last;
95
	    } elsif (/^\# Using configuration:$/) {
96
		$files = 1;
97
	    }
98
	} elsif (/^\#\s+(.*)\s*$/) {
99
	    push (@$opt_c, $1);
100
	} else {
101
	    # All done!
102
	    last;
103
	}
104
    }
22
105
23
$scoreset = int($scoreset) if defined $scoreset;
106
    if (!defined $opt_c) {
24
if (!defined $newscores || $scoreset < 0 || $scoreset >= $NUM_SCORESETS ) {
107
      $opt_c = [ "$FindBin::Bin/../rules" ];
25
  die "usage: rewrite-cf-with-new-scores scoreset oldscores.cf newscores.cf\n";
108
    }
109
110
    foreach my $file (@$opt_c) {
111
	die "Can't read $file" unless -r $file;
112
    }
26
}
113
}
27
114
28
system ("./parse-rules-for-masses -s $scoreset") and die;
115
if (!$opt_old) {
29
if (-e "tmp/rules.pl") {
116
  $opt_old = $$opt_c[0] . "/50_scores.cf";
30
  # Note, the spaces need to stay in front of the require to work around a RPM 4.1 problem
31
  require "./tmp/rules.pl";
32
}
117
}
33
else {
34
  die "parse-rules-for-masses had no error but no tmp/rules.pl!?!";
35
}
36
118
119
# Defaults as documented in the POD above: the newly generated scores are
# read from ./perceptron.scores and the rewritten file goes to
# ./50_scores.cf.  Without a default for $opt_out the later
# open(OUT, ">$opt_out") fails whenever --output is omitted.
$opt_new ||= "perceptron.scores";
$opt_out ||= "50_scores.cf";
120
121
my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
122
					       scoreset => $opt_scoreset});
123
124
$masses->readrules();
125
my $rules = $masses->get_rules_hash();
126
37
# now read the generated scores
127
# now read the generated scores
38
my @gascoreorder = ();
128
my @gascoreorder = ();
129
my %oldscores = ();
39
my %gascorelines = ();
130
my %gascorelines = ();
40
open (STDIN, "<$newscores") or die "cannot open $newscores";
131
open (STDIN, "<$opt_new") or die "cannot open $opt_new";
41
while (<STDIN>) {
132
while (<STDIN>) {
42
  /^score\s+(\S+)\s+(-?\d+(?:\.\d+)?)/ or next;
133
  /^score\s+(\S+)\s+(-?\d+(?:\.\d+)?)/ or next;
43
  my $name = $1;  my $score = $2;
134
  my $name = $1;  my $score = $2;
44
  next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0);
135
  next unless (exists ($rules->{$name}) && !$rules->{$name}->{issubrule});
45
  next if ($name =~ /^__/);
136
  next if ($name =~ /^__/);
46
  next if ($name eq '(null)');	# er, oops ;)
137
  next if ($name eq '(null)');	# er, oops ;)
47
138
Lines 49-55 Link Here
49
  push (@gascoreorder, $name);
140
  push (@gascoreorder, $name);
50
}
141
}
51
142
52
open (IN, "<$oldscores") or die "cannot open $oldscores";
143
open (IN, "<$opt_old") or die "cannot open $opt_old";
53
my $out = '';
144
my $out = '';
54
my $pre = '';
145
my $pre = '';
55
146
Lines 58-64 Link Here
58
while (<IN>) {
149
while (<IN>) {
59
  if (/^\s*score\s+(\S+)\s/) {
150
  if (/^\s*score\s+(\S+)\s/) {
60
    delete $gascorelines{$1};
151
    delete $gascorelines{$1};
61
    next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0);
152
    next unless (exists ($rules->{$1}) && $rules->{$1}->{issubrule} == 0);
62
  }
153
  }
63
  $pre .= $_;
154
  $pre .= $_;
64
  /^# Start of generated scores/ and last;
155
  /^# Start of generated scores/ and last;
Lines 82-91 Link Here
82
  if (/^\s*score\s+\S+/) {
173
  if (/^\s*score\s+\S+/) {
83
    my($score,$name,@scores) = split;
174
    my($score,$name,@scores) = split;
84
175
85
    next unless (exists ($rules{$name}) && $rules{$name}->{issubrule} == 0);
176
    next unless (exists ($rules->{$name}) && !$rules->{$name}->{issubrule});
86
    if (defined $gascorelines{$name}) {
177
    if (defined $gascorelines{$name}) {
87
      # Set appropriate scoreset value
178
      # Set appropriate scoreset value
88
      $scores[$scoreset] = $gascorelines{$name};
179
      $scores[$opt_scoreset] = $gascorelines{$name};
89
180
90
      # Create new score line
181
      # Create new score line
91
      $_ = join(" ","score",$name,generate_scores(@scores))."\n";
182
      $_ = join(" ","score",$name,generate_scores(@scores))."\n";
Lines 96-103 Link Here
96
}
187
}
97
close IN;
188
close IN;
98
189
190
open OUT, ">$opt_out" or die "Can't open $opt_out: $!";
191
99
# and output the lot
192
# and output the lot
100
print $pre, "\n";
193
print OUT $pre, "\n";
101
foreach my $name (@gascoreorder) {
194
foreach my $name (@gascoreorder) {
102
  $_ = $gascorelines{$name};
195
  $_ = $gascorelines{$name};
103
  next unless (defined ($_));
196
  next unless (defined ($_));
Lines 107-118 Link Here
107
  @scores = @{$oldscores{$name}} if ( exists $oldscores{$name} );
200
  @scores = @{$oldscores{$name}} if ( exists $oldscores{$name} );
108
201
109
  # Set appropriate scoreset value
202
  # Set appropriate scoreset value
110
  $scores[$scoreset] = $_;
203
  $scores[$opt_scoreset] = $_;
111
204
112
  # Create new score line
205
  # Create new score line
113
  print join(" ","score",$name,generate_scores(@scores)),"\n";
206
  print OUT join(" ","score",$name,generate_scores(@scores)),"\n";
114
}
207
}
115
print "\n", $out, "\n";
208
print OUT "\n", $out, "\n";
116
209
117
sub generate_scores {
210
sub generate_scores {
118
  my (@scores) = @_;
211
  my (@scores) = @_;
(-)masses/mboxget (-45 lines)
Lines 1-45 Link Here
1
#!/usr/bin/perl -w
2
3
# mboxget - get a message from a mailbox
4
#
5
# usage: mboxget [mass-check-mbox-id ...]
6
#
7
# <@LICENSE>
8
# Copyright 2004 Apache Software Foundation
9
# 
10
# Licensed under the Apache License, Version 2.0 (the "License");
11
# you may not use this file except in compliance with the License.
12
# You may obtain a copy of the License at
13
# 
14
#     http://www.apache.org/licenses/LICENSE-2.0
15
# 
16
# Unless required by applicable law or agreed to in writing, software
17
# distributed under the License is distributed on an "AS IS" BASIS,
18
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
# See the License for the specific language governing permissions and
20
# limitations under the License.
21
# </@LICENSE>
22
23
use strict;
24
25
my $prog = $0;
26
$prog =~ s@.*/@@;
27
28
foreach my $where (@ARGV) {
29
  my ($file, $offset) = ($where =~ m/(.*?)(?:\.(\d+))?$/);
30
  open(INPUT, $file) || die("$prog: open $file failed: $!\n");
31
  if ($offset) {
32
    seek(INPUT, $offset, 0) || die("$prog: seek $offset failed: $!\n");
33
  }
34
  my $past = 0;
35
  while (<INPUT>) {
36
    if ($past) {
37
      last if substr($_,0,5) eq "From ";
38
    }
39
    else {
40
      $past = 1;
41
    }
42
    print $_;
43
  }
44
  close INPUT;
45
}
(-)masses/rule-qa/corpus-nightly (-3 / +3 lines)
Lines 81-94 Link Here
81
date > test.end
81
date > test.end
82
82
83
# results name
83
# results name
84
mv spam.log spam-$net$username.log
84
mv masses.log masses-$net$username.log
85
mv ham.log ham-$net$username.log
86
85
87
# rsync
86
# rsync
88
set +e
87
set +e
89
retry=0
88
retry=0
90
while true; do
89
while true; do
91
	if rsync -CPcvuzb --timeout=120 spam-$net$username.log ham-$net$username.log $username@rsync.spamassassin.org::corpus/; then
90
	if rsync -CPcvuzb --timeout=120 masses-$net$username.log $username@rsync.spamassassin.org::corpus/; then
92
		break;
91
		break;
93
	fi
92
	fi
94
	if [ $retry -eq 120 ]; then
93
	if [ $retry -eq 120 ]; then
Lines 99-101 Link Here
99
	sleep 30
98
	sleep 30
100
done
99
done
101
set -e
100
set -e
101
(-)masses/rule-qa/corpus-hourly (-91 / +54 lines)
Lines 92-98 Link Here
92
    @files = sort readdir(CORPUS);
92
    @files = sort readdir(CORPUS);
93
    closedir(CORPUS);
93
    closedir(CORPUS);
94
94
95
    @files = grep { /^(?:spam|ham)-(?:net-)?\w+\.log$/ && -f "$opt{corpus}/$_" && -M _ < 10 } @files;
95
    @files = grep { /^masses-(?:net-)?\w+\.log$/ && -f "$opt{corpus}/$_" && -M _ < 10 } @files;
96
    @files = grep {
96
    @files = grep {
97
	my $time = 0;
97
	my $time = 0;
98
	my $tag = 0;
98
	my $tag = 0;
Lines 109-114 Link Here
109
	}
109
	}
110
	$time;
110
	$time;
111
    } @files;
111
    } @files;
112
112
}
113
}
113
114
114
sub rename {
115
sub rename {
Lines 158-288 Link Here
158
159
159
	    next if ($class eq "NET" && $age !~ /^(?:new|all|age|7day)$/);
160
	    next if ($class eq "NET" && $age !~ /^(?:new|all|age|7day)$/);
160
161
161
	    my @ham = grep { /^ham/ } @files;
162
	    print STDERR "logs: " . join(' ', @files) . "\n";
162
	    my @spam = grep { /^spam/ } @files;
163
163
164
	    print STDERR "ham: " . join(' ', @ham) . "\n";
165
	    print STDERR "spam: " . join(' ', @spam) . "\n";
166
167
	    chdir $opt{corpus};
164
	    chdir $opt{corpus};
168
165
169
	    # net vs. local
166
	    # net vs. local
170
	    if ($class eq "NET") {
167
	    if ($class eq "NET") {
171
		@ham = grep { /-net-/ } @ham;
168
		@files = grep { /-net-/ } @files;
172
		@spam = grep { /-net-/ } @spam;
169
		print STDERR "logs: " . join(' ', @files) . "\n";
173
		print STDERR "ham: " . join(' ', @ham) . "\n";
174
		print STDERR "spam: " . join(' ', @spam) . "\n";
175
	    }
170
	    }
176
	    else {
171
	    else {
177
		# if both net and local exist, use newer
172
		# if both net and local exist, use newer
178
		my %spam;
179
		my %ham;
180
		
173
		
181
		for my $file (@spam) {
174
		for my $file (@files) {
182
		    $spam{$1}++ if ($file =~ m/-(\w+)\.log$/);
175
		    $logs{$1}++ if ($file =~ m/-(\w+)\.log$/);
183
		}
176
		}
184
		for my $file (@ham) {
177
		while (my ($user, $count) = each %logs) {
185
		    $ham{$1}++ if ($file =~ m/-(\w+)\.log$/);
186
		}
187
		while (my ($user, $count) = each %ham) {
188
		    if ($count > 1) {
178
		    if ($count > 1) {
189
			my $nightly = "ham-$user.log";
179
			my $nightly = "masses-$user.log";
190
			my $weekly = "ham-net-$user.log";
180
			my $weekly = "masses-net-$user.log";
191
			if ($revision{$nightly} >= $revision{$weekly}) {
181
			if ($revision{$nightly} >= $revision{$weekly}) {
192
			    @ham = grep { $_ ne $weekly } @ham;
182
			    @files = grep { $_ ne $weekly } @files;
193
			}
183
			}
194
			else {
184
			else {
195
			    @ham = grep { $_ ne $nightly } @ham;
185
			    @files = grep { $_ ne $nightly } @files;
196
			}
186
			}
197
		    }
187
		    }
198
		}
188
		}
199
		while (my ($user, $count) = each %spam) {
189
		print STDERR "logs: " . join(' ', @files) . "\n";
200
		    if ($count > 1) {
201
			my $nightly = "spam-$user.log";
202
			my $weekly = "spam-net-$user.log";
203
			if ($revision{$nightly} >= $revision{$weekly}) {
204
			    @spam = grep { $_ ne $weekly } @spam;
205
			}
206
			else {
207
			    @spam = grep { $_ ne $nightly } @spam;
208
			}
209
		    }
210
		}
211
		print STDERR "ham: " . join(' ', @ham) . "\n";
212
		print STDERR "spam: " . join(' ', @spam) . "\n";
213
	    }
190
	    }
214
	    
191
	    
215
	    # age
192
	    # age
216
	    if ($class eq "NET" && $age ne "7day") {
193
	    if ($class eq "NET" && $age ne "7day") {
217
		@ham = grep { -M "$_" < 10 } @ham;
194
		@files = grep { -M "$_" < 10 } @files;
218
		@spam = grep { -M "$_" < 10 } @spam;
219
		# find most recent CVS revision
195
		# find most recent CVS revision
220
		my $wanted = 0.0;
196
		my $wanted = 0.0;
221
		for (@spam, @ham) {
197
		for (@spam, @ham) {
222
		    $wanted = $revision{$_} if ($revision{$_} > $wanted);
198
		    $wanted = $revision{$_} if ($revision{$_} > $wanted);
223
		}
199
		}
224
		@spam = grep { $revision{$_} eq $wanted } @spam;
200
		@files = grep { $revision{$_} eq $wanted } @files;
225
		@ham = grep { $revision{$_} eq $wanted } @ham;
201
226
		print STDERR "ham: " . join(' ', @ham) . "\n";
202
		print STDERR "logs: " . join(' ', @files) . "\n";
227
		print STDERR "spam: " . join(' ', @spam) . "\n";
228
	    }
203
	    }
229
	    elsif ($age =~ /^(?:new|all|age)$/) {
204
	    elsif ($age =~ /^(?:new|all|age)$/) {
230
		@ham = grep { -M "$_" < -M $opt{tagtime} } @ham;
205
		@files = grep { -M "$_" < -M $opt{tagtime} } @files;
231
		@spam = grep { -M "$_" < -M $opt{tagtime} } @spam;
206
232
		@ham = grep { $revision{$_} eq $revision } @ham;
207
		@files = grep { $revision{$_} eq $revision } @files;
233
		@spam = grep { $revision{$_} eq $revision } @spam;
208
234
		print STDERR "ham: " . join(' ', @ham) . "\n";
209
		print STDERR "logs: " . join(' ', @files) . "\n";
235
		print STDERR "spam: " . join(' ', @spam) . "\n";
236
	    }
210
	    }
237
	    elsif ($age =~ /(\d+)day/) {
211
	    elsif ($age =~ /(\d+)day/) {
238
		my $mtime = $1;
212
		my $mtime = $1;
239
		@ham = grep { -M "$_" < $mtime } @ham;
213
		@files = grep { -M "$_" < $mtime } @files;
240
		@spam = grep { -M "$_" < $mtime } @spam;
214
241
		print STDERR "ham: " . join(' ', @ham) . "\n";
215
		print STDERR "logs: " . join(' ', @files) . "\n";
242
		print STDERR "spam: " . join(' ', @spam) . "\n";
243
	    }
216
	    }
244
	    
217
	    
245
	    open(OUT, "> $opt{html}/$class.$age");
218
	    open(OUT, "> $opt{html}/$class.$age");
246
	    print OUT "# ham results used: " . join(" ", @ham) . "\n";
219
	    print OUT "# results used: " . join(" ", @files) . "\n";
247
	    print OUT "# spam results used: " . join(" ", @spam) . "\n";
220
248
	    for (@ham) {
221
	    for (@files) {
249
		print OUT "# $_=$revision{$_}\n" if $revision{$_} ne $revision;
222
		print OUT "# $_=$revision{$_}\n" if $revision{$_} ne $revision;
250
	    }
223
	    }
251
	    for (@spam) {
252
		print OUT "# $_=$revision{$_}\n" if $revision{$_} ne $revision;
253
	    }
254
224
255
	    my $flags = "";
225
	    my $flags = "";
256
	    $flags = "-t net -s 1" if $class eq "NET";
226
	    $flags = "-t net -s 1" if $class eq "NET";
257
	    $flags = "-M HTML_MESSAGE" if $class eq "HTML";
227
	    $flags = "-M HTML_MESSAGE" if $class eq "HTML";
258
228
259
	    if ($age eq "all") {
229
	    if ($age eq "all") {
260
		my %spam;
230
		my %logs;
261
		my %ham;
262
		my @output;
231
		my @output;
263
		
232
		
264
		for my $file (@spam) {
233
		for my $file (@files) {
265
		    $spam{$1} = $file if ($file =~ m/-(\w+)\.log$/);
234
		    $logs{$1} = $file if ($file =~ m/-(\w+)\.log$/);
266
		}
235
		}
267
		for my $file (@ham) {
236
268
		    $ham{$1} = $file if ($file =~ m/-(\w+)\.log$/);
237
		unlink "$opt{tmp}/masses.log.$$";
269
		}
238
270
		unlink "$opt{tmp}/ham.log.$$";
239
		next unless (scalar keys %logs);
271
		unlink "$opt{tmp}/spam.log.$$";
240
		for my $user (sort keys %logs) {
272
		next unless (scalar keys %spam && scalar keys %ham);
241
273
		for my $user (sort keys %spam) {
274
		    next unless defined $ham{$user};
275
		    chdir "$opt{tree}/masses";
242
		    chdir "$opt{tree}/masses";
276
		    system("cat $opt{corpus}/$ham{$user} >> $opt{tmp}/ham.log.$$");
243
		    system("cat $opt{corpus}/$logs{$user} >> $opt{tmp}/masses.log.$$");
277
		    system("cat $opt{corpus}/$spam{$user} >> $opt{tmp}/spam.log.$$");
244
		    open(IN, "./hit-frequencies -xpa $flags -l $opt{corpus}/$logs{$user} |");
278
		    open(IN, "./hit-frequencies -xpa $flags $opt{corpus}/$spam{$user} $opt{corpus}/$ham{$user} |");
279
		    while(<IN>) {
245
		    while(<IN>) {
280
			chomp;
246
			chomp;
281
			push @output, "$_:$user\n";
247
			push @output, "$_:$user\n";
282
		    }
248
		    }
283
		    close(IN);
249
		    close(IN);
284
		}
250
		}
285
		open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
251
		open(IN, "./hit-frequencies -xpa $flags -l $opt{tmp}/masses.log.$$ |");
286
		while(<IN>) {
252
		while(<IN>) {
287
		    push @output, $_;
253
		    push @output, $_;
288
		}
254
		}
Lines 298-318 Link Here
298
		    my ($after, $before) = split(/-/, $which);
264
		    my ($after, $before) = split(/-/, $which);
299
		    # get and filter logs
265
		    # get and filter logs
300
		    chdir $opt{corpus};
266
		    chdir $opt{corpus};
301
		    for my $type (("ham", "spam")) {
267
302
			open(TMP, "> $opt{tmp}/$type.log.$$");
268
		    open(TMP, "> $opt{tmp}/masses.log.$$");
303
			my @array = ($type eq "ham") ? @ham : @spam;
269
		    for my $file (@files) {
304
			for my $file (@array) {
270
		      open(IN, $file);
305
			    open(IN, $file);
271
		      while (<IN>) {
306
			    while (<IN>) {
272
			print TMP $_ if time_filter($after, $before);
307
				print TMP $_ if time_filter($after, $before);
273
		      }
308
			    }
274
		      close(IN);
309
			    close(IN);
310
			}
311
			close (TMP);
312
		    }
275
		    }
276
		    close (TMP);
277
313
		    # print out by age
278
		    # print out by age
314
		    chdir "$opt{tree}/masses";
279
		    chdir "$opt{tree}/masses";
315
		    open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
280
		    open(IN, "./hit-frequencies -xpa $flags -l $opt{tmp}/masses.log.$$ |");
316
		    while(<IN>) {
281
		    while(<IN>) {
317
			chomp;
282
			chomp;
318
			push @output, "$_:$which\n";
283
			push @output, "$_:$which\n";
Lines 323-335 Link Here
323
		    print OUT $_;
288
		    print OUT $_;
324
		}
289
		}
325
	    }
290
	    }
326
	    elsif (@ham && @spam) {
291
	    elsif (@files) {
327
		# get logs
292
		# get logs
328
		system("cat " . join(" ", @ham) . " > $opt{tmp}/ham.log.$$");
293
		system("cat " . join(" ", @files) . " > $opt{tmp}/masses.log.$$");
329
		system("cat " . join(" ", @spam) . " > $opt{tmp}/spam.log.$$");
330
	
331
		chdir "$opt{tree}/masses";
294
		chdir "$opt{tree}/masses";
332
		open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
295
		open(IN, "./hit-frequencies -xpa $flags -l $opt{tmp}/masses.log.$$ |");
333
		while(<IN>) {
296
		while(<IN>) {
334
		    print(OUT);
297
		    print(OUT);
335
		}
298
		}
(-)masses/README.user (+375 lines)
Line 0 Link Here
1
2
HOW TO GENERATE YOUR OWN SCORES FOR SPAMASSASSIN
3
-----------------------------------------------
4
5
Duncan Findlay
6
<duncf@debian.org>
7
8
9
1. Introduction
10
11
One of the reasons SpamAssassin is so accurate is that its scores are
12
carefully optimized based on collections (aka. corpus, plural:
13
corpora) of mail from volunteers all across the world. Each volunteer
14
uses a script ("mass-check") to run SpamAssassin over each piece of
15
mail in their corpus. They then submit the results to a central server
16
where the SpamAssassin development team runs the scoring mechanism to
17
generate optimal scores.
18
19
SpamAssassin uses four different scoresets depending on the options
20
used. These are almost always referred to by number, as shown below:
21
22
Scoreset
23
   0	 - Network tests disabled, Bayes disabled
24
   1     - Network tests enabled, Bayes disabled
25
   2     - Network tests disabled, Bayes enabled
26
   3     - Network tests enabled, Bayes enabled
27
28
Things are further complicated by the fact that when Bayes is enabled,
29
it automatically learns using the equivalent scoreset with Bayes
30
disabled. As a result, optimal scores for scoresets 2 and 3 can only
31
be generated after scoresets 0 and 1. Set 0 logs can be generated from
32
set 1 logs, but sets 2 and 3 need to be done separately.
33
34
As a result, volunteers who take part in our rescoring survey need to
35
run 3 mass-checks, each of which can take many hours. Since the
36
generation of scores is such a laborious process, the SpamAssassin
37
developers only perform this once per release.
38
39
Luckily, the previous score optimizer, a Genetic Algorithm, which took
40
almost 24 hours to optimize scores for one scoreset has been replaced
41
with the Perceptron (thanks to Henry Stern) which uses a "Stochastic
42
Gradient Descent" method. Don't worry if you don't understand what
43
this means, I certainly don't. The Perceptron takes less than 15
44
seconds to generate scores of roughly equal quality as the GA.
45
46
47
2. Compiling a Corpus
48
49
The first step to generating your own scores is to start collecting
50
mail, both ham (non-spam) and spam. These should be representative of
51
all the mail you receive, but you should filter out spam related
52
lists, like spamassassin-users to avoid skewing results. It is
53
essential that these corpora be very well classified. It will greatly
54
reduce the effectiveness of your scores if spam mails get misfiled
55
into your ham folder and vice versa.
56
57
Also, it is important to note that SpamAssassin is not designed to be
58
a virus filter, so it's best if you filter out viruses from your ham
59
and spam folders too.
60
61
Furthermore, since spam and ham characteristics change over time, it's
62
best to leave out mail over 6 months old. This is especially important for
63
network tests, since these are designed to stop current spam, and are
64
not historical records.
65
66
I'm not entirely sure how big corpora should be. The bigger, the
67
better. If your corpus is too small, it may not be sufficiently
68
representative of all the mail you receive, and accuracy will
69
suffer. My corpus of mail for the last 6 months is over 55000 messages
70
(35000 spam, 20000 ham).
71
72
73
3. Mass-check
74
75
Now that you've assembled your corpora, you need to use mass-check to
76
test each message with SpamAssassin. This script is surprisingly fast,
77
as it accesses the internal perl libraries of SpamAssassin, without
78
the need to load a new perl process each time (as you would if you
79
piped each message through spamassassin). Doing a scoreset 2 run (no
80
network, bayes enabled) I get roughly 10,000 messages an hour on an
81
unloaded Pentium 4, 2.80Ghz computer with 512 MB RAM.
82
83
By default, if you are not running out of an unpacked source tree,
84
mass-check will read rules from the usual locations. As a result, you
85
should make sure ~/.spamassassin/user_prefs contains no rules, unless
86
you are planning on using your generated scores for only yourself, not
87
sitewide.
88
89
The first step is to define the locations of all of the messages in
90
your corpora (these are known as "targets"). I find it's easiest to
91
put this in a separate file with lines of the following format:
92
93
class:format:location
94
95
Class is either "spam" or "ham", format is "mbox", "file", "dir" or
96
"mbx" and location is the path to the mailbox. mass-check supports
97
using * as a wildcard, so the following target is permitted:
98
99
spam:mbox:/home/duncf/Maildir/Old/spam/*
100
101
Once you have placed all the "targets" necessary for your corpora, run
102
mass-check with the following command.
103
104
mass-check -f file
105
106
If you are doing a mass-check run for scoreset 1 or 3 (i.e. network tests
107
enabled) you will also need to add the --net option, and you will want
108
to add -j8 (or some other number) to indicate how many messages to
109
test in parallel. This is useful since a lot of time would otherwise
110
be spent waiting for network queries to return.
111
112
mass-check will generate a log file in the current directory entitled
113
masses.log. This is the log file that will enable us to optimize
114
scores.
115
116
For the impatient: if you're one of those people who want to know
117
exactly how far mass-check has gotten through your mail, use the
118
--showdots option.
119
120
121
4. Checking the quality of your corpora (a.k.a. Pulling Weeds)
122
123
In order to ensure that your corpora don't contain misfiled mails, it
124
is good to double check the highest scoring hams and lowest scoring
125
spams.
126
127
First check ham mail:
128
129
grep "^h" masses.log | sort -rn -k2,2 | head -20
130
131
If you want to read the corresponding messages try piping to
132
extract-message-from-mbox -m (see the extract-message-from-mbox
133
section for more detail).
134
135
Do the same with spam mail:
136
137
grep "^s" masses.log | sort -n -k2,2 | head -20
138
139
140
5. extract-message-from-mbox
141
142
extract-message-from-mbox takes a mbox filename and a byte offset and
143
outputs the corresponding mail message. With the -m option, mass-check
144
output (i.e. lines from masses.log) is read from the standard
145
input. Without, arguments are expected to be in the form
146
<mbox>.<offset> (i.e. /path/to/mbox.12345)
147
148
The -h option can also be used to only show message headers.
149
150
As shown above, it is quite useful to pipe portions of masses.log to
151
extract-message-from-mbox.
152
153
154
6. hit-frequencies
155
156
hit-frequencies doesn't really help you advance toward your goal of
157
optimizing scores, but it is very useful in evaluating locally created
158
rules. Run it, look at its output; you'll find it interesting (and if
159
not, feel free to skip to the next section).
160
161
hit-frequencies -x -p -s <scoreset>
162
163
hit-frequencies (and many other scripts) are set to automatically
164
guess where to find your configuration files based on
165
masses.log. Unfortunately, it isn't perfect (actually it's a rather
166
crude hack, but that's irrelevant). You may have to check masses.log
167
to figure out where it's searching and/or add --cffile options (you
168
can specify multiple paths using multiple --cffile options).
169
170
hit-frequencies -x -p generates the following output:
171
172
OVERALL%   SPAM%     HAM%     S/O    RANK   SCORE  NAME
173
  64008    40932    23076    0.639   0.00    0.00  (all messages)
174
100.000  63.9483  36.0517    0.639   0.00    0.00  (all messages as %)
175
 10.382  16.2342   0.0000    1.000   1.00    3.10  FORGED_MUA_OUTLOOK
176
  8.266  12.9263   0.0000    1.000   0.99    1.00  FORGED_OUTLOOK_TAGS
177
  6.484  10.1388   0.0000    1.000   0.98    4.50  DRUGS_ERECTILE_OBFU
178
[...]
179
180
The first two rows show the size of the corpora and their ham/spam
181
break down. The following lines list each rule found and give various
182
statistics about it based on your masses.log.
183
184
OVERALL% represents the percentage of total messages (spam and ham)
185
that the rule hits, SPAM% and HAM% show the percentages on each
186
corpus. S/O is SPAM% divided by (SPAM% + HAM%). Generally good
187
(non-nice) rules have S/O's over 0.95, while nice (negative-scoring)
188
rules generally have S/O's less than 0.5. RANK is a human readable
189
indicator of how good a rule is. The higher the better, always. RANK
190
is designed to be a rough indicator of the score the perceptron is
191
likely to give it. SCORE is simply the current score. (This is simply
192
listed for convenience, not calculated in any way.)
193
194
If you do any rule development locally, you will find this is a great
195
tool. If you come up with some great rules (that we haven't already
196
thought of), please send us a patch at
197
http://bugzilla.spamassassin.org/.
198
199
200
7. lint-rules-from-freqs
201
202
This script is designed to read in your masses.log and the
203
SpamAssassin configuration files in order to find both bad syntax and
204
bad rules that hit few messages or (with -f) have too many false
205
positives/negatives, etc.
206
207
lint-rules-from-freqs -f -s <scoreset>
208
209
As with hit-frequencies, it tries to be smart with choosing the right
210
--cffile options.
211
212
This script is roughly the equivalent of running a spamassassin --lint
213
and running a hit-frequencies to determine which tests have bad S/O
214
ratios.
215
216
217
8. logs-to-c
218
219
logs-to-c is the program that converts a mass-check log into code that
220
can be easily used by the perceptron. Currently, it is necessary to
221
use the output of logs-to-c to even compile perceptron, but that
222
should hopefully change in the near future.
223
224
The files logs-to-c creates need to be in the tmp/ sub-directory of the
225
directory where perceptron.c is.
226
227
logs-to-c -o tmp/ -s <scoreset>
228
229
These files contain information about each rule such as whether or not
230
the perceptron is permitted to change the rule's score, the range
231
within which the perceptron can adjust it, whether or not a rule is
232
nice, etc. In addition, these files contain information about each
233
mail hit and which tests were hit. The files generated by logs-to-c
234
are not really easy to read, so don't try; use hit-frequencies
235
instead.
236
237
238
9. perceptron
239
240
perceptron is the brains behind the whole process. (And we must of
241
course thank the brain behind perceptron, Henry Stern, for his
242
contribution.)
243
244
While the perceptron takes options for things such as "ham
245
preference", "number of epochs", "learning rate" and "weight decay",
246
it's probably best to trust the defaults; unless of course you want to
247
try to find the optimum parameters (and post them to
248
http://bugzilla.spamassassin.org/ with your evidence).
249
250
The perceptron is incredibly quick. So start it, wait 15 seconds and
251
voila, your optimized scores are ready. The output is in
252
perceptron.scores.
253
254
Unfortunately, it needs to be built from source every time you want to
255
use it with a different masses.log or set of rules. In the directory
256
containing perceptron.c, try:
257
258
make perceptron
259
./perceptron
260
261
If you don't have the Makefile, try
262
gcc -g -O2 -Wall -o perceptron perceptron.c -lm
263
./perceptron
264
265
266
10. rewrite-cf-with-new-scores
267
268
perceptron dumps its results in perceptron.scores. Great. How does
269
that help you? rewrite-cf-with-new-scores takes care of changing the
270
old configuration files to correspond with the new scores. The script
271
takes into account rules found in your configuration, so make sure
272
that the --cffile argument is right (it'll read this from masses.log
273
by default). The syntax is:
274
275
rewrite-cf-with-new-scores --old 50_scores.cf --new perceptron.scores \
276
  --out 50_scores.new.cf -l masses.log -s 2
277
278
Make sure you don't forget the -s option. You need to tell it which
279
scoreset to update or it'll update set 0, which is not what you want
280
(unless you just did a set 0 run, of course).
281
282
Note: the statistics in the new scores file are NOT updated. Just the
283
scores are.
284
285
11. fp-fn-statistics
286
287
This script calculates how good the scores are at a given threshold. It
288
returns the number of false positives, false negatives, true
289
positives, true negatives and a whole variety of fun statistics.
290
291
./fp-fn-statistics -s <scoreset> --cffile <path>
292
293
fp-fn-statistics also generates a TCR which is essentially an overall
294
rating of how good the scores are. (This is only accurate when run on
295
a different corpus of mail than that with which the scores were
296
generated). TCR stands for "Total Cost Ratio". The higher the number,
297
the better the set of scores.
298
299
300
12. Submitting corpora for SpamAssassin
301
302
If you want to contribute your mass-check logs to the SpamAssassin
303
rescoring process, please download the latest revision of SpamAssassin
304
from the subversion repository. See this page of the wiki:
305
http://wiki.spamassassin.org/DownloadFromSvn
306
307
You will want to read CORPUS_POLICY and CORPUS_SUBMIT. We only do
308
large rescoring runs just before releases, so be sure to follow the
309
lists which will have more information and reminders on how to
310
participate.
311
312
Please be sure your corpora are of high quality (everything must be
313
carefully checked to avoid misfilings). Also, we appreciate varied
314
sources of mail.
315
316
317
13. Other scripts
318
319
Only a subset of the scripts used in rule development and scoring have
320
been documented here. Most of the others aren't really very
321
useful. You can examine the others by downloading the source from the
322
subversion repository: http://wiki.spamassassin.org/DownloadFromSvn.
323
Everything relating to rule QA and development is in the masses/
324
sub-directory.
325
326
The scripts presented here have had man pages written for them, and an
327
attempt has been made to standardize the options for ease of use. Many
328
of the others may require some reading of source to understand how
329
they work and what they do.
330
331
332
14. Frequently Asked Questions
333
334
(Since this is the first version of this document, I'm guessing what
335
questions would otherwise be asked. So this isn't really a "Frequently
336
Asked Questions" list, but a "What did Duncan fail to address
337
elsewhere?" list.)
338
339
Q. Why don't the scripts automatically guess which scoreset to use like
340
they do with --cffile?
341
342
A. Firstly, mass-check does not know what scoreset
343
you are running. It could guess, but it probably shouldn't. Secondly,
344
the same masses.log can be used for multiple scoresets (a set 1 log
345
can be used to generate scores for sets 0 and 1, by stripping out net
346
rules etc.)
347
348
Q. How can I determine how good the scoring system is?
349
350
A. There is a series of scripts in the source directory (in
351
masses/tenpass/) designed to determine how accurate the perceptron is
352
by using "10-fold Cross Validation" (10fcv). Basically, the masses.log
353
is split into 10 "buckets" and each bucket is sequentially used to
354
validate against scores generated from the remaining 9.
355
356
357
15. Bugs, author, improvements, etc.
358
359
SpamAssassin is written and maintained by a group of developers, whose
360
names can be found in the CREDITS file.
361
362
If you have further questions about SpamAssassin or the rescoring
363
scripts, try the following:
364
365
- Ask on one of the SpamAssassin mailing lists:
366
367
http://www.spamassassin.org/lists.html
368
369
- If you've found a bug, file a report:
370
371
http://bugzilla.spamassassin.org/
372
373
- Also, check out our wiki:
374
375
http://wiki.spamassassin.org/
(-)masses/runGA (-24 / +21 lines)
Lines 1-47 Link Here
1
#!/bin/sh
1
#!/bin/sh
2
2
3
SCORESET="0"
3
SCORESET="0"
4
if [ "x$1" != "x" ] ; then
5
    SCORESET=$1
6
fi
7
4
NAME="set$SCORESET"
8
NAME="set$SCORESET"
9
BASE="logs"
5
10
6
if [ ! -f "ORIG/ham-$NAME.log" -o ! -f "ORIG/spam-$NAME.log" ]; then
11
if [ ! -f "ORIG/masses-$NAME.log" ]; then
7
	echo "Couldn't find logs for $NAME" >&2
12
	echo "Couldn't find logs for $NAME" >&2
8
	exit 1
13
	exit 1
9
fi
14
fi
10
15
11
if [ "x$1" = "x" ]; then
16
if [ "x$2" = "x" ]; then
12
echo "[Doing a scoreset $SCORESET score-generation run]"
17
echo "[Doing a scoreset $SCORESET score-generation run]"
13
18
14
# Clean out old runs
19
# Clean out old runs
15
echo "[Cleaning up]"
20
echo "[Cleaning up]"
16
rm -rf spam-validate.log nonspam-validate.log ham-validate.log spam.log nonspam.log ham.log NSBASE SPBASE tmp make.output freqs perceptron.scores \
21
17
	gen-$NAME.out gen-$NAME.scores gen-$NAME.validate
22
rm -rf masses-validate.log masses.log $BASE tmp make.output freqs \
23
    perceptron.scores gen-$NAME.out gen-$NAME.scores gen-$NAME.validate
18
make clean >/dev/null
24
make clean >/dev/null
19
25
20
# Generate 90/10 split logs
26
# Generate 90/10 split logs
21
echo "[Generating 90/10 split ham]"
27
echo "[Generating 90/10 split ham]"
22
mkdir NSBASE SPBASE
28
mkdir $BASE
23
cd NSBASE
29
cd $BASE
24
../tenpass/split-log-into-buckets 10 < ../ORIG/ham-$NAME.log > /dev/null
30
../tenpass/split-log-into-buckets 10 < ../ORIG/masses-$NAME.log > /dev/null
25
cat split-[1-9].log > nonspam.log
31
cat split-[1-9].log > masses.log
26
rm -f split-[1-9].log
32
rm -f split-[1-9].log
27
mv split-10.log nonspam-validate.log
33
mv split-10.log masses-validate.log
28
34
29
echo "[Generating 90/10 split spam]"
30
cd ../SPBASE
31
../tenpass/split-log-into-buckets 10 < ../ORIG/spam-$NAME.log > /dev/null
32
cat split-[1-9].log > spam.log
33
rm -f split-[1-9].log
34
mv split-10.log spam-validate.log
35
cd ..
35
cd ..
36
36
37
echo "[Setting up for gen run]"
37
echo "[Setting up for gen run]"
38
# Ok, setup for a run
38
# Ok, setup for a run
39
ln -s SPBASE/spam.log .
39
ln -s $BASE/masses.log .
40
ln -s NSBASE/nonspam.log .
40
ln -s $BASE/masses-validate.log .
41
ln -s NSBASE/nonspam.log ham.log
42
ln -s SPBASE/spam-validate.log .
43
ln -s NSBASE/nonspam-validate.log .
44
ln -s NSBASE/nonspam-validate.log ham-validate.log
45
41
46
# try to find number of processors
42
# try to find number of processors
47
numcpus=`cpucount 2>/dev/null || egrep -c '^processor\b' /proc/cpuinfo 2>/dev/null || echo 1`
43
numcpus=`cpucount 2>/dev/null || egrep -c '^processor\b' /proc/cpuinfo 2>/dev/null || echo 1`
Lines 57-67 Link Here
57
53
58
else
54
else
59
55
56
echo "Make sure 50_scores.cf has been replaced appropriately"
57
60
# This needs to have 50_scores.cf in place first ...
58
# This needs to have 50_scores.cf in place first ...
61
echo "[gen validation results]"
59
echo "[gen validation results]"
62
./logs-to-c --spam=SPBASE/spam-validate.log \
60
./fp-fn-statistics --logfile=BASE/masses-validate.log \
63
	--nonspam=NSBASE/nonspam-validate.log \
61
	--cffile=../rules --scoreset=$SCORESET | tee gen-$NAME.validate
64
	--count --cffile=../rules --scoreset=$SCORESET | tee gen-$NAME.validate
65
62
66
echo "[STATISTICS file generation]"
63
echo "[STATISTICS file generation]"
67
./mk-baseline-results $SCORESET | tee gen-$NAME.statistics
64
./mk-baseline-results $SCORESET | tee gen-$NAME.statistics
(-)masses/lint-rules-from-freqs (-248 / +159 lines)
Lines 16-139 Link Here
16
# limitations under the License.
16
# limitations under the License.
17
# </@LICENSE>
17
# </@LICENSE>
18
18
19
=head1 NAME
20
21
lint-rules-from-freqs - Try to find problems with SpamAssassin rules
22
23
=head1 SYNOPSIS
24
25
lint-rules-from-freqs [options]
26
27
 Options:
28
    -c,--cffile=path	  Use path as the rules directory
29
    -s,--scoreset=n	  Use scoreset n
30
    -l,--logfile=file	  Read in file instead of masses.log
31
    -f			  Also take into account false positives/negatives
32
33
=head1 DESCRIPTION
34
35
This script analyzes SpamAssassin tests, based on the hit frequencies
36
and S/O ratios from a mass-check log (masses.log).  This script can
37
also optionally take into account the false positive/negative
38
frequencies.
39
40
The script first uses the SpamAssassin rules parser to report on any
41
illegal syntax. Then it checks the rules match frequencies from the
42
mass-check log in order to determine how effective the rule is.
43
44
=head1 BUGS
45
46
Please report bugs to http://bugzilla.spamassassin.org/
47
48
=head1 SEE ALSO
49
50
L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>
51
52
=cut
53
54
55
use FindBin;
56
use lib "$FindBin::Bin/../lib";
57
use Mail::SpamAssassin::Masses;
58
use Mail::SpamAssassin;
59
use Getopt::Long qw(:config bundling auto_help);
60
use strict;
61
use warnings;
62
19
# any tests that get less than this % of matches on *both* spam or nonspam, are
63
# any tests that get less than this % of matches on *both* spam or nonspam, are
20
# reported.
64
# reported.
21
my $LOW_MATCHES_PERCENT = 0.03;
65
my $LOW_MATCHES_PERCENT = 0.03;
22
my $scoreset = 0;
23
66
24
sub usage {
67
use vars qw($opt_c $opt_l $opt_s $opt_f $opt_p);
25
  die "
26
lint-rules-from-freqs: perform 'lint' testing on SpamAssassin rules and scores
27
68
28
usage: ./lint-rules-from-freqs [-f falsefreqs] < freqs > badtests
69
GetOptions("c|cffile=s@" => \$opt_c,
70
	   "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
71
	   "l|logfile=s" => \$opt_l,
72
	   "f|falses" => \$opt_f);
29
73
30
This analyzes SpamAssassin tests, based on the hit frequencies and S/O ratios
31
from a mass-check logfile pair.
32
74
33
The 'freqs' argument is the frequency of hits in all messages ('hit-frequencies
75
$opt_s = 0 unless defined $opt_s;
34
-x -p' output).
76
$opt_l ||= "masses.log";
35
77
36
The 'falsefreqs' argument is frequencies of hits in false-positives and
78
if (!$opt_c || !scalar(@$opt_c)) {
37
false-negatives only ('hit-frequencies -x -p -f' output).
79
    # Try to read this in from the log, if possible
80
    open IN, $opt_l or die "Can't open $opt_l: $!";
81
    my $files = 0; # are we in the files section?
82
    while(<IN>) {
83
	if (!$files) {
84
	    if (/^\# SVN revision:/) {
85
		$opt_c = [ "$FindBin::Bin/../rules" ];
86
		last;
87
	    } elsif (/^\# Using configuration:$/) {
88
		$files = 1;
89
	    }
90
	} elsif (/^\#\s+(.*)\s*$/) {
91
	    push (@$opt_c, $1);
92
	} else {
93
	    # All done!
94
	    last;
95
	}
96
    }
38
97
39
";
98
    if (!defined $opt_c) {
40
}
99
      $opt_c = [ "$FindBin::Bin/../rules" ];
100
    }
41
101
42
my $opt_falsefreqs;
102
    foreach my $file (@$opt_c) {
43
while ($#ARGV >= 0) {
103
	die "Can't read $file" unless -r $file;
44
  $_ = shift @ARGV;
104
    }
45
  if (/^-f/) { $_ = shift @ARGV; $opt_falsefreqs = $_; }
46
  elsif (/^-s/) { $_ = shift @ARGV; $scoreset = $_; }
47
  else { usage(); }
48
}
105
}
49
106
50
print "BAD TESTS REPORT\n";
107
print "BAD TESTS REPORT\n";
51
readrules();
108
# First, do a --lint
52
print "\n" .((scalar keys %rulefile) + 1). " rules found.\n";
109
53
print "\nRule file syntax issues:\n\n";
110
print "\nRule file syntax issues:\n\n";
54
lintrules();
55
111
56
if ($opt_falsefreqs) {
112
{
57
  open (FALSE, "<$opt_falsefreqs");
113
  local (*STDERR) = \*STDOUT; # Get lint errors on STDOUT
58
  while (<FALSE>) {
114
59
    if (!/^\s*([\d\.]+)/) {
115
  # Read the config ourselves...
60
      my ($overall, $spam, $nons, $so, $score, $name) = split (' ');
116
61
      next unless ($name =~ /\S/);
117
  # Read init.pre from each directory, then glob for the rest.
62
      $falsefreqs_spam{$name} = $spam;
118
63
      $falsefreqs_nons{$name} = $nons;
119
  my $cf_txt = '';
64
      $falsefreqs_so{$name} = $so;
120
  my @files;
121
  my @dirs;
122
  foreach my $file (@$opt_c) {
123
    if (-d $file) {
124
      if  (-r "$file/init.pre") {
125
	push @files, "$file/init.pre";
126
      }
127
      push @dirs, $file;
65
    }
128
    }
129
    else {
130
      push @files, $file;
131
    }
66
  }
132
  }
67
  close FALSE;
133
  foreach my $dir (@dirs) {
68
}
134
    my @cfs = glob("$dir/*.cf");
135
    push @files, grep { -r $_ } @cfs;
136
  }
69
137
70
while (<>) {
138
  foreach my $file (@files) {
71
  if (!/^\s*([\d\.]+)/) {
139
    if (-r $file) {
72
    $output{'a_header'} = $_; next;
140
      open IN, $file;
141
      $cf_txt .= "file start $file\n";
142
      $cf_txt .= join('', <IN>);
143
      $cf_txt .= "\nfile end $file\n";
144
      close IN;
145
    }
73
  }
146
  }
74
147
148
  my $spamtest = new Mail::SpamAssassin({config_text => $cf_txt});
149
150
  $spamtest->lint_rules();
151
}
152
153
154
# Next, check for other stuff
155
my $masses = Mail::SpamAssassin::Masses->new({rulesdir => $opt_c,
156
					      scoreset => $opt_s, #,,
157
					      falses => $opt_f,
158
					      logfile => $opt_l});
159
160
$masses->readlogs();
161
$masses->do_statistics();
162
163
my $rules = $masses->get_rules_array();
164
165
166
my %output;
167
168
foreach my $rule (@$rules) {
169
75
  my $badrule;
170
  my $badrule;
76
  my ($overall, $spam, $nons, $so, $score, $name) = split (' ');
77
  next unless ($name =~ /\S/);
78
171
79
  my $ffspam = $falsefreqs_spam{$name};
172
  next if ($rule->{tflags} =~ /\bnet\b/ && ($opt_s % 2) == 0);
80
  my $ffnons = $falsefreqs_nons{$name};
173
  next if ($rule->{tflags} =~ /\buserconf\b/);
81
  my $ffso = $falsefreqs_so{$name};
82
174
83
  my $tf = $tflags{$name};
175
  if ($rule->{freq_spam} == 0 && $rule->{freq_ham} == 0) {        # sanity!
84
  next if ($tf =~ /net/ && ($scoreset % 2) == 0);
85
  next if ($tf =~ /userconf/);
86
176
87
  if ($overall == 0.0 && $spam == 0.0 && $nons == 0.0) {        # sanity!
88
    $badrule = 'no matches';
177
    $badrule = 'no matches';
89
178
90
  } else {
179
  } else {
91
    if ($score < 0.0) {
180
    if ($rule->{score} < 0.0) {
92
      # negative score with more spams than nonspams? bad rule.
181
      # negative score with more spams than nonspams? bad rule.
93
      if ($tf !~ /nice/ && $so > 0.5 && $score < 0.5) {
182
      if (!$rule->{isnice} && $rule->{soratio} > 0.5 && $rule->{score} < 0.5) {
94
        $badrule = 'non-nice but -ve score';
183
        $badrule = 'non-nice but -ve score';
95
      }
184
      }
96
185
      if ($rule->{isnice} && $rule->{soratio} > 0.5 && $rule->{score} < 0.5) {
97
      if ($tf =~ /nice/ && $so > 0.5 && $score < 0.5) {
186
        if ($opt_f && $rule->{freq_fn} < $rule->{freq_fp}) {
98
        if ($ffso < 0.5) {
99
          $badrule = 'fn';
187
          $badrule = 'fn';
100
        } else {
101
          # ignore, the FNs are overridden by other tests so it doesn't
102
          # affect the overall results.
103
        }
188
        }
189
        # else {
190
        # ignore, the FNs are overridden by other tests so it doesn't
191
        # affect the overall results.
192
        # }
104
      }
193
      }
105
194
106
      # low number of matches overall
195
      # low number of matches overall
107
      if ($nons < $LOW_MATCHES_PERCENT) 
196
      if ($rule->{ham_percent} < $LOW_MATCHES_PERCENT)
108
                 { $badrule ||= ''; $badrule .= ', low matches'; }
197
                 { $badrule ||= ''; $badrule .= ', low matches'; }
109
198
110
    } elsif ($score > 0.0) {
199
    } elsif ($rule->{score} > 0.0) {
111
      # positive score with more nonspams than spams? bad.
200
      # positive score with more nonspams than spams? bad.
112
      if ($tf =~ /nice/ && $so < 0.5 && $score > 0.5) {
201
      if ($rule->{isnice} && $rule->{soratio} < 0.5 && $rule->{score} > 0.5) {
113
        $badrule = 'nice but +ve score';
202
        $badrule = 'nice but +ve score';
114
      }
203
      }
115
204
 
116
      if ($tf !~ /nice/ && $so < 0.5 && $score > 0.5) {
205
      if (!$rule->{isnice} && $rule->{soratio} < 0.5 && $rule->{score} > 0.5) {
117
        if ($ffso > 0.5) {
206
        if ($opt_f && $rule->{freq_fp} > $rule->{freq_fn}) {
118
          $badrule = 'fp';
207
          $badrule = 'fp';
119
        } else {
120
          # ignore, the FPs are overridden by other tests so it doesn't
121
          # affect the overall results.
122
        }
208
        }
209
        # else {
210
        # ignore, the FPs are overridden by other tests so it doesn't
211
        # affect the overall results.
212
        # }
123
      }
213
      }
124
214
 
125
      # low number of matches overall
215
      # low number of matches overall
126
      if ($spam < $LOW_MATCHES_PERCENT) 
216
      if ($rule->{spam_percent} < $LOW_MATCHES_PERCENT)
127
                 { $badrule ||= ''; $badrule .= ', low matches'; }
217
                 { $badrule ||= ''; $badrule .= ', low matches'; }
128
218
 
129
    } elsif ($score == 0.0) {
219
    } elsif ($rule->{score} == 0.0) {
130
      $badrule = 'score is 0';
220
      $badrule = 'score is 0';
131
    }
221
    }
132
  }
222
  }
133
223
 
134
  if (defined $badrule) {
224
  if (defined $badrule) {
135
    $badrule =~ s/^, //; chomp;
225
    $badrule =~ s/^, //;
136
    $output{$badrule} .= $_ . " ($badrule)\n";
226
    $output{$badrule} .= $rule->{name} . " ($badrule)\n";
137
  }
227
  }
138
}
228
}
139
229
Lines 156-337 Link Here
156
exit;
246
exit;
157
247
158
248
159
sub concat_rule_lang {
160
  my $rule = shift;
161
  my $lang = shift;
162
163
  if (defined $lang && $lang ne '') {
164
    return "[$lang]_$rule";
165
  } else {
166
    return $rule;
167
  }
168
}
169
170
# note: do not use parse-rules-for-masses here, we need to do linting instead
171
# of your average parse
172
sub readrules {
173
  my @files = <../rules/[0-9]*.cf>;
174
  my $file;
175
  %rulesfound = ();
176
  %langs = ();
177
  foreach $file (@files) {
178
    open (IN, "<$file");
179
    while (<IN>) {
180
      s/#.*$//g; s/^\s+//; s/\s+$//; next if /^$/;
181
182
      # make all the foo-bar stuff foo_bar
183
      1 while s/^(\S+)-/\1_/g;
184
      1 while s/^(lang\s+\S+\s+\S+)-/\1_/g;
185
186
      my $lang = '';
187
      if (s/^lang\s+(\S+)\s+//) {
188
        $lang = $1; $langs{$1} = 1;
189
      }
190
191
      if (/^(header|rawbody|body|full|uri|meta)\s+(\S+)\s+/) {
192
        $rulesfound{$2} = 1;
193
        $rulefile{$2} ||= $file;
194
        $scorefile{$1} = $file;
195
        $score{$2} ||= 1.0;
196
        $tflags{$2} ||= '';
197
        $descfile{$2} ||= $file;       # a rule with no score or desc is OK
198
	$description{$2}->{$lang} = undef;
199
200
        if (/^body\s+\S+\s+eval:/) {
201
          # ignored
202
        } elsif (/^body\s+\S+\s+(.*)$/) {
203
          my $re = $1;
204
205
	  # If there's a ( in a rule where it should be (?:, flag it.
206
	  # but ignore [abc(] ...
207
          if ($re =~ /[^\\]\([^\?]/ && $re !~ /\[[^\]]*[^\\]\(/) { 
208
            print "warning: non-(?:...) capture in regexp in $file: $_\n";
209
          }
210
          if ($re =~ /\.[\*\+]/) { 
211
            print "warning: .* in regexp in $file: $_\n";
212
          }
213
          if ($re =~ /[^\\]\{(\d*),?(\d*?)\}/) {
214
            if ($1 > 120 || $2 > 120) {
215
              print "warning: long .{n} in regexp in $file: $_\n";
216
            }
217
          }
218
        }
219
220
      } elsif (/^describe\s+(\S+)\s+(.*?)\s*$/) {
221
        $rulesfound{$1} = 1;
222
        $descfile{concat_rule_lang ($1, $lang)} ||= $file;
223
        $descfile{$1} ||= $file;
224
	$description{$1}->{$lang} = $2;
225
      } elsif (/^tflags\s+(\S+)\s+(.+)$/) {
226
        $rulesfound{$1} = 1;
227
        $tflags{$1} = $2;
228
        $tflagsfile{concat_rule_lang ($1, $lang)} = $file;
229
        $tflagsfile{$1} = $file;
230
      } elsif (/^score\s+(\S+)\s+(.+)$/) {
231
        $rulesfound{$1} = 1;
232
        $scorefile{concat_rule_lang ($1, $lang)} = $file;
233
        $scorefile{$1} = $file;
234
        $score{$1} = $2;
235
      } elsif (/^(clear_report_template|clear_spamtrap_template|report|spamtrap|
236
                clear_terse_report_template|terse_report|
237
                required_score|ok_locales|ok_languages|test|lang|
238
                spamphrase|whitelist_from|require_version|
239
		clear_unsafe_report_template|unsafe_report|
240
		(?:bayes_)?auto_learn_threshold_nonspam|(?:bayes_)?auto_learn_threshold_spam|
241
		(?:bayes_)?auto_learn
242
                )/x) {
243
        next;
244
      } else {
245
        print "warning: unknown rule in $file: $_\n";
246
      }
247
    }
248
    close IN;
249
  }
250
  @langsfound = sort keys %langs;
251
  @rulesfound = sort keys %rulesfound;
252
}
253
254
sub lintrules {
255
  my %possible_renames = ();
256
257
  foreach my $rule (@rulesfound) {
258
    my $match = $rule;
259
    $match =~ s/_\d+[^_]+$//gs;    # trim e.g. "_20K"
260
    $match =~ s/[^A-Z]+//gs;    # trim numbers etc.
261
262
    if (defined ($rulefile{$rule}) && $possible_renames{$match} !~ / \Q$rule\E\b/) {
263
      $possible_renames{$match} .= " ".$rule;
264
    }
265
    $possible_rename_matches{$rule} = $match;
266
  }
267
268
  foreach my $lang ('', @langsfound) {
269
    foreach my $baserule (@rulesfound) {
270
      next if ( $baserule =~ /^__/ || $baserule =~ /^T_/ );
271
272
      my $rule = concat_rule_lang ($baserule, $lang);
273
      my $f = $descfile{$rule};
274
      my $warned = '';
275
276
      if (defined $f && !defined ($rulefile{$rule})
277
                && !defined ($rulefile{$baserule}))
278
      {
279
        print "warning: $baserule has description, but no rule: $f\n";
280
        $warned .= ' lamedesc';
281
      }
282
283
	# Check our convention for rule length
284
	if ( (($lang ne '' && defined($rulefile{$rule})) || ($lang eq '' && defined ($rulefile{$baserule}))) && length $baserule > 22 ) {
285
	  print "warning: $baserule has a name longer than 22 chars: $f\n";
286
	}
287
 	# Check our convention for rule length
288
	if ( (($lang ne '' && defined($rulefile{$rule})) || ($lang eq '' && defined ($rulefile{$baserule}))) && defined $description{$baserule}->{$lang} && length $description{$baserule}->{$lang} > 50 ) {
289
	  print "warning: $baserule has a description longer than 50 chars: $f\n";
290
	}
291
292
      # lang rule trumps normal rule
293
      $f = $rulefile{$rule} || $rulefile{$baserule};
294
      # if the rule exists, and the language/rule description doesn't exist ...
295
      if ( defined $f && !defined $description{$baserule}->{$lang} )
296
      {
297
        print "warning: $baserule exists, ",( $lang ne '' ? "lang $lang, " : "" ),"but has no description: $f\n";
298
        $warned .= ' lamedesc';
299
      }
300
301
302
      $f = $scorefile{$rule};
303
      if (defined $f && !defined ($rulefile{$rule})
304
                && !defined ($rulefile{$baserule}))
305
      {
306
        print "warning: $baserule has score, but no rule: $f\n";
307
        $warned .= ' lamescore';
308
      }
309
310
      my $r = $possible_rename_matches{$rule};
311
      if ($warned ne '' && defined $r) {
312
        my @matches = split (' ', $possible_renames{$r});
313
        if (scalar @matches != 0) {
314
          my $text = '';
315
316
          # now try and figure out "nearby" rules with no description/score
317
          foreach my $baser (@matches) {
318
            my $blang;
319
            if ($descfile{$rule} =~ /text_(\S\S)\./) {
320
              $blang = $1;
321
            }
322
            my $r = concat_rule_lang ($baser, $blang);
323
            #warn "$r $descfile{$r} $descfile{$baser}";
324
            next if ($warned =~ /lamedesc/ && (defined $descfile{$r}));
325
            next if ($warned =~ /lamescore/ && (defined $scorefile{$r}));
326
            $text .= " $baser";
327
          }
328
329
          if ($text ne '') {
330
            print "warning: (possible renamed rule? $text)\n";
331
          }
332
        }
333
      }
334
    }
335
  }
336
}
337
(-)masses/Makefile (-16 / +9 lines)
Lines 3-36 Link Here
3
LDFLAGS=	-lm
3
LDFLAGS=	-lm
4
4
5
# What rule scoreset are we using?
5
# What rule scoreset are we using?
6
SCORESET =	0
6
SCORESET =	3
7
LOGFILE =	masses.log
7
8
8
#### Should be no need to modify below this line
9
#### Should be no need to modify below this line
9
10
10
all: badrules perceptron
11
all: badrules perceptron
11
12
12
perceptron: perceptron.o
13
perceptron: perceptron.o
13
	$(CC) -o perceptron perceptron.o $(LDFLAGS)
14
	$(CC) -o perceptron perceptron.o $(LDFLAGS) 
14
15
15
perceptron.o: tmp/rules.pl tmp/tests.h tmp/scores.h
16
perceptron.o: tmp/tests.h
16
	$(CC) $(CFLAGS) -c -o perceptron.o perceptron.c
17
	$(CC) $(CFLAGS) -c -o perceptron.o perceptron.c
17
18
18
tmp/rules.pl: tmp/.created parse-rules-for-masses
19
tmp/tests.h: tmp/.created logs-to-c
19
	perl parse-rules-for-masses -d ../rules -s $(SCORESET)
20
	perl logs-to-c --scoreset=$(SCORESET) --logfile=$(LOGFILE)
20
21
21
tmp/tests.h: tmp/.created tmp/ranges.data logs-to-c
22
freqs: masses.log
22
	perl logs-to-c --scoreset=$(SCORESET)
23
	perl hit-frequencies -x -p -s $(SCORESET) --logfile=$(LOGFILE) > freqs
23
24
24
tmp/scores.h: tmp/tests.h
25
26
tmp/ranges.data: tmp/.created freqs score-ranges-from-freqs
27
	perl score-ranges-from-freqs ../rules $(SCORESET) < freqs
28
29
freqs: spam.log ham.log
30
	perl hit-frequencies -x -p -s $(SCORESET) > freqs
31
32
badrules: freqs
25
badrules: freqs
33
	perl lint-rules-from-freqs < freqs > badrules
26
	perl lint-rules-from-freqs -s $(SCORESET) --logfile=$(LOGFILE) > badrules
34
27
35
tmp/.created:
28
tmp/.created:
36
	-mkdir tmp
29
	-mkdir tmp
(-)masses/mass-check (-110 / +237 lines)
Lines 16-159 Link Here
16
# limitations under the License.
16
# limitations under the License.
17
# </@LICENSE>
17
# </@LICENSE>
18
18
19
sub usage {
19
=head1 NAME
20
  die <<ENDOFUSAGE;
21
usage: mass-check [options] target ...
22
 
23
  -c=file       set configuration/rules directory
24
  -p=dir        set user-prefs directory
25
  -f=file       read list of targets from <file>
26
  -j=jobs       specify the number of processes to run simultaneously
27
  --net         turn on network checks!
28
  --mid         report Message-ID from each message
29
  --debug       report debugging information
30
  --progress    show progress updates during check
31
  --rewrite=OUT save rewritten message to OUT (default is /tmp/out)
32
  --showdots    print a dot for each scanned message
33
  --rules=RE    Only test rules matching the given regexp RE
34
  --restart=N   restart all of the children after processing N messages
35
  --deencap=RE  Extract SpamAssassin-encapsulated spam mails only if they
36
                were encapsulated by servers matching the regexp RE
37
                (default = extract all SpamAssassin-encapsulated mails)
38
 
39
  log options
40
  -o            write all logs to stdout
41
  --loghits     log the text hit for patterns (useful for debugging)
42
  --loguris	log the URIs found
43
  --hamlog=log  use <log> as ham log ('ham.log' is default)
44
  --spamlog=log use <log> as spam log ('spam.log' is default)
45
 
46
  message selection options
47
  -n            no date sorting or spam/ham interleaving
48
  --after=N     only test mails received after time_t N (negative values
49
                are an offset from current time, e.g. -86400 = last day)
50
                or after date as parsed by Time::ParseDate (e.g. '-6 months')
51
  --before=N    same as --after, except received times are before time_t N
52
  --all         don't skip big messages
53
  --head=N      only check first N ham and N spam (N messages if -n used)
54
  --tail=N      only check last N ham and N spam (N messages if -n used)
55
 
56
  simple target options (implies -o and no ham/spam classification)
57
  --dir         subsequent targets are directories
58
  --file        subsequent targets are files in RFC 822 format
59
  --mbox        subsequent targets are mbox files
60
  --mbx         subsequent targets are mbx files
61
 
62
  Just left over functions we should remove at some point:
63
  --bayes       report score from Bayesian classifier
64
 
65
  non-option arguments are used as target names (mail files and folders),
66
  the target format is: <class>:<format>:<location>
67
  <class>       is "spam" or "ham"
68
  <format>      is "dir", "file", "mbx", or "mbox"
69
  <location>    is a file or directory name.  globbing of ~ and * is supported
70
20
71
ENDOFUSAGE
21
mass-check - Generates SpamAssassin scores and results for large
72
}
22
amounts of mail
73
23
24
=head1 SYNOPSIS
25
26
 mass-check [options] class:format:location ...
27
 mass-check [options] {--dir | --file | --mbox} target ...
28
 mass-check [options] -f file
29
30
  Options:
31
    -f=file       read list of targets from <file>
32
    -j=jobs       specify the number of processes to run simultaneously
33
    --net         turn on network checks!
34
    --mid         report Message-ID from each message
35
    --debug       report debugging information
36
    --progress    show progress updates during check
37
    --rewrite=OUT save rewritten message to OUT (default is /tmp/out)
38
    --showdots    print a dot for each scanned message
39
    --rules=RE    Only test rules matching the given regexp RE
40
    --restart=N   restart all of the children after processing N messages
41
42
    SpamAssassin options
43
    -c=dir        set configuration/rules directory
44
    -p=file       set user preferences file (default: none)
45
    -s=dir        set site rules configuration directory
46
    -u=dir        set user-state directory
47
    --dist        assumes the script is being run from the masses/ dir of
48
                  the unpacked tarball, and makes appropriate guesses for
49
                  -p and -c
50
    --deencap=RE  Extract SpamAssassin-encapsulated spam mails only if they
51
                  were encapsulated by servers matching the regexp RE
52
                  (default = extract all SpamAssassin-encapsulated mails)
53
54
    log options
55
    -o            write all logs to stdout
56
    --loghits     log the text hit for patterns (useful for debugging)
57
    --loguris	  log the URIs found
58
    --log=file    log to <file> (masses.log is default)
59
60
    message selection options
61
    -n            no date sorting or spam/ham interleaving
62
    --after=N     only test mails received after time_t N (negative values
63
                  are an offset from current time, e.g. -86400 = last day)
64
                  or after date as parsed by Time::ParseDate (e.g. '-6 months')
65
    --before=N    same as --after, except received times are before time_t N
66
    --all         don't skip big messages
67
    --head=N      only check first N ham and N spam (N messages if -n used)
68
    --tail=N      only check last N ham and N spam (N messages if -n used)
69
70
    simple target options (implies -o and no ham/spam classification)
71
    --dir         subsequent targets are directories
72
    --file        subsequent targets are files in RFC 822 format
73
    --mbox        subsequent targets are mbox files
74
    --mbx         subsequent targets are mbx files
75
76
    Just left over functions we should remove at some point:
77
    --bayes       report score from Bayesian classifier
78
    --hamlog=log  use <log> as ham log ('ham.log' is default)
79
    --spamlog=log use <log> as spam log ('spam.log' is default)
80
81
=head1 DESCRIPTION
82
83
B<mass-check> is designed to assist with rule development and
84
generation of SpamAssassin scores. It reads in mail from the
85
location(s) specified on the command line (in the first form above),
86
given in the form I<class:format:location>, where I<class> is either
87
"spam" or "ham" (non-spam), I<format> is one of "dir" (Maildirs, MH,
88
etc), "file", "mbox" (mboxes can be gzipped) or "mbx".
89
90
B<mass-check> will analyze each message using SpamAssassin and
91
generate one line of output per message (by default to masses.log) in
92
the following format:
93
94
 {s|h} {s|h} score filename tests-hit
95
96
The first field is the message's class as given on the command line
97
(ham or spam). The second is the message's class as determined by
98
SpamAssassin. The third is the message's score, as determined by
99
SpamAssassin. The fourth field contains the message's filename; for
100
mboxes, this contains the filename and the byte offset from the
101
beginning of the file separated by a period. The last field contains a
102
list of all the tests the message hit separated by commas.
103
104
If you want to run this on the currently installed version of
105
SpamAssassin's rules for sitewide use, make sure your user_prefs file
106
contains no rules.
107
108
=head1 BUGS
109
110
Please report bugs to http://bugzilla.spamassassin.org/
111
112
=head1 SEE ALSO
113
114
L<hit-frequencies(1)>, L<logs-to-c(1)>, L<Mail::SpamAssassin::Masses(3)>,
115
L<perceptron(1)>
116
117
=cut
118
74
###########################################################################
119
###########################################################################
75
120
76
use vars qw($opt_c $opt_p $opt_f $opt_j $opt_n $opt_o $opt_all $opt_bayes
121
use vars qw($opt_c $opt_p $opt_f $opt_j $opt_n $opt_o $opt_all
77
	    $opt_debug $opt_format $opt_hamlog $opt_head $opt_loghits
122
	    $opt_bayes $opt_before $opt_debug $opt_dist $opt_format
78
	    $opt_mid $opt_mh $opt_ms $opt_net $opt_nosort $opt_progress
123
	    $opt_hamlog $opt_head $opt_log $opt_loghits $opt_mid
79
	    $opt_showdots $opt_spamlog $opt_tail $opt_rules $opt_restart
124
	    $opt_mh $opt_ms $opt_net $opt_nosort $opt_p $opt_progress
80
	    $opt_loguris $opt_after $opt_before $opt_rewrite $opt_deencap);
125
	    $opt_s $opt_showdots $opt_spamlog $opt_tail $opt_rules
126
	    $opt_restart $opt_loguris $opt_after $opt_rewrite $opt_u
127
	    $opt_deencap);
81
128
82
use FindBin;
129
use FindBin;
83
use lib "$FindBin::Bin/../lib";
130
use lib "$FindBin::Bin/../lib";
84
eval "use bytes";
131
eval "use bytes";
85
use Mail::SpamAssassin::ArchiveIterator;
132
use Mail::SpamAssassin::ArchiveIterator;
86
use Mail::SpamAssassin;
133
use Mail::SpamAssassin;
87
use Getopt::Long;
134
use Getopt::Long qw(:config bundling auto_help);
135
use Pod::Usage;
88
use POSIX qw(strftime);
136
use POSIX qw(strftime);
89
use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; };
137
use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; };
138
use strict; # Why wasn't this on?
90
use Config;
139
use Config;
91
140
92
# default settings
141
# default settings
93
$opt_c = "$FindBin::Bin/../rules";
142
94
$opt_p = "$FindBin::Bin/spamassassin";
95
$opt_j = 1;
143
$opt_j = 1;
96
$opt_net = 0;
144
$opt_net = 0;
97
$opt_hamlog = "ham.log";
145
$opt_log = "masses.log";
98
$opt_spamlog = "spam.log";
99
146
100
GetOptions("c=s", "p=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug",
147
GetOptions("c|cffile=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug",
101
	   "hamlog=s", "head=i", "loghits", "mh", "mid", "ms", "net",
148
	   "deencap=s", "dist!", "hamlog=s", "head=i", "log=s",
102
	   "progress", "rewrite:s", "showdots", "spamlog=s", "tail=i",
149
	   "loghits", "mh", "mid", "ms", "net", "p=s", "progress",
103
	   "rules=s", "restart=i", "after=s", "before=s", "loguris", "deencap=s",
150
	   "rewrite:s", "s=s", "showdots", "spamlog=s", "tail=i",
151
	   "rules=s", "restart=i", "u=s", "after=s", "loguris",
104
	   "dir" => sub { $opt_format = "dir"; },
152
	   "dir" => sub { $opt_format = "dir"; },
105
	   "file" => sub { $opt_format = "file"; },
153
	   "file" => sub {$opt_format = "file"; },
106
	   "mbox" => sub { $opt_format = "mbox"; },
154
	   "mbox" => sub { $opt_format = "mbox"; },
107
	   "mbx" => sub { $opt_format = "mbx"; },
155
	   "mbx" => sub { $opt_format = "mbx"; },
108
	   '<>' => \&target) or usage();
156
	   '<>' => \&target);
109
157
158
if ($opt_hamlog || $opt_spamlog) { # Old style logging
159
  $opt_hamlog ||= "ham.log";
160
  $opt_spamlog ||= "spam.log";
161
}
162
163
my @targets;
164
110
if ($opt_f) {
165
if ($opt_f) {
111
  open(F, $opt_f) || die $!;
166
  open(F, $opt_f) || die $!;
112
  push(@targets, map { chomp; $_ } <F>);
167
  push(@targets, map { chomp; $_ } <F>);
113
  close(F);
168
  close(F);
114
}
169
}
115
170
116
if (scalar @targets == 0) { usage(); }
171
if (scalar @targets == 0) { pod2usage("No target defined!"); }
117
172
118
#if ($opt_ms) {
173
# Auto-detect --dist option
119
#find_missed($opt_spamlog);
174
if (!defined $opt_dist) {
120
#}
175
  if (-f "$FindBin::Bin/../spamassassin.raw") {
121
#elsif ($opt_mh) {
176
    warn "Automatically using --dist. Assuming you are running from the unpacked tarball. Use --no-dist to override.";
122
#find_missed($opt_hamlog);
177
    $opt_dist = 1;
123
#}
178
  }
179
}
124
180
125
$spamtest = new Mail::SpamAssassin ({
181
my $local_rules_dir;
126
  'debug'              			=> $opt_debug,
127
  'rules_filename'     			=> $opt_c,
128
  'userprefs_filename' 			=> "$opt_p/user_prefs",
129
  'site_rules_filename'			=> "$opt_p/local.cf",
130
  'userstate_dir'     			=> "$opt_p",
131
  'save_pattern_hits'  			=> $opt_loghits,
132
  'dont_copy_prefs'   			=> 1,
133
  'local_tests_only'   			=> $opt_net ? 0 : 1,
134
  'only_these_rules'   			=> $opt_rules,
135
  'ignore_safety_expire_timeout'	=> 1,
136
  PREFIX				=> '',
137
  DEF_RULES_DIR        			=> $opt_c,
138
  LOCAL_RULES_DIR      			=> '',
139
});
140
182
183
if ($opt_dist) { # Set defaults
184
  $opt_c ||= "$FindBin::Bin/../rules";
185
  $opt_p ||= "$FindBin::Bin/mass-check.cf";
186
  $opt_u ||= "$FindBin::Bin/spamassassin";
187
  $opt_s ||= "$FindBin::Bin/spamassassin";
188
  $local_rules_dir = '';
189
}
190
else {
191
  if(!$opt_u) {
192
    # Assuming this is OK, since mass-check isn't supported on Windows, is it?
193
    # Also, should there be some check to make sure that previous mass-check stuff isn't in there?
194
    # AFAICT, there isn't otherwise....
195
    if ( -d "${ENV{HOME}}/.spamassassin" ) {
196
      $opt_u = "${ENV{HOME}}/.spamassassin/mass-check";
197
      warn "$opt_u already exists -- may contain files that will effect the results" if (-d $opt_u);
198
      mkdir $opt_u, 0700 if (! -d $opt_u);
199
    }
200
  }
201
202
# Leave the rest to SA, we'll get it afterwards
203
204
}
205
206
207
$opt_s =~ s/~/$ENV{HOME}/ if $opt_s;
208
$opt_c =~ s/~/$ENV{HOME}/ if $opt_c;
209
$opt_p =~ s/~/$ENV{HOME}/ if $opt_p;
210
$opt_u =~ s/~/$ENV{HOME}/ if $opt_u;
211
212
213
my $spamtest = new Mail::SpamAssassin ({
214
				       'debug'              			=> $opt_debug,
215
				       'rules_filename'     			=> $opt_c,
216
				       'userprefs_filename' 			=> $opt_p,
217
				       'site_rules_filename'			=> $opt_s,
218
				       'userstate_dir'     			=> $opt_u,
219
				       'save_pattern_hits'  			=> $opt_loghits,
220
				       'dont_copy_prefs'   			=> 1,
221
				       'local_tests_only'   			=> $opt_net ? 0 : 1,
222
				       'only_these_rules'   			=> $opt_rules,
223
				       'ignore_safety_expire_timeout'	=> 1,
224
				       DEF_RULES_DIR        			=> $opt_c,
225
				       LOCAL_RULES_DIR      			=> $local_rules_dir,
226
				      });
227
141
$spamtest->compile_now(1);
228
$spamtest->compile_now(1);
142
$spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf");
229
if ($opt_dist) {
230
  $spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf");
231
}
143
232
144
my $who   = `id -un 2>/dev/null`;   chomp $who;
233
my $who   = `id -un 2>/dev/null`;   chomp $who;
145
my $where = `uname -n 2>/dev/null`; chomp $where;
234
my $where = `uname -n 2>/dev/null`; chomp $where;
146
my $when  = `date -u`;              chomp $when;
235
my $when  = `date -u`;              chomp $when;
147
my $revision = "unknown";
236
my $revision;
148
if (open(TESTING, "$opt_c/70_testing.cf")) {
237
149
  chomp($revision = <TESTING>);
238
if ($opt_dist) {
150
  $revision =~ s/.*\$Rev:\s*(\S+).*/$1/;
239
  my $rev = "unknown";
151
  close(TESTING);
240
  if (open(TESTING, "$opt_c/70_testing.cf")) {
241
    chomp($rev = <TESTING>);
242
    $rev =~ s/.*\$Rev:\s*(\S+).*/$1/;
243
    close(TESTING);
244
  }
245
  $revision = "SVN revision: $rev";
152
}
246
}
247
else {
248
  $revision = "Local";
249
}
250
153
my $log_header = "# mass-check results from $who\@$where, on $when\n" .
251
my $log_header = "# mass-check results from $who\@$where, on $when\n" .
154
		 "# M:SA version ".$spamtest->Version()."\n" .
252
		 "# M:SA version ".$spamtest->Version()."\n" .
155
		 "# SVN revision: $revision\n" .
253
		 "# $revision\n" .
156
		 "# Perl version: $] on $Config{archname}\n";
254
		 "# Perl version: $] on $Config{archname}\n";
255
256
if (!$opt_dist) {
257
  my @paths = ( $spamtest->{rules_filename}, $spamtest->{site_rules_filename}, $spamtest->{userprefs_filename} );
258
  $log_header .= "# Using configuration:\n";
259
  foreach my $file (@paths) {
260
    $log_header .=  "# $file\n";
261
  }
262
}
263
157
my $host = $ENV{'HOSTNAME'} || $ENV{'HOST'} || `hostname` || 'localhost';
264
my $host = $ENV{'HOSTNAME'} || $ENV{'HOST'} || `hostname` || 'localhost';
158
chomp $host;
265
chomp $host;
159
266
Lines 222-228 Link Here
222
    autoflush STDOUT 1;
329
    autoflush STDOUT 1;
223
    print STDOUT $log_header;
330
    print STDOUT $log_header;
224
  }
331
  }
225
  else {
332
  elsif ($opt_hamlog || $opt_spamlog) {
226
    open(HAM, "> $opt_hamlog");
333
    open(HAM, "> $opt_hamlog");
227
    open(SPAM, "> $opt_spamlog");
334
    open(SPAM, "> $opt_spamlog");
228
    autoflush HAM 1;
335
    autoflush HAM 1;
Lines 230-235 Link Here
230
    print HAM $log_header;
337
    print HAM $log_header;
231
    print SPAM $log_header;
338
    print SPAM $log_header;
232
  }
339
  }
340
  else {
341
    open(OUT, "> $opt_log");
342
    autoflush OUT 1;
343
    print OUT $log_header;
344
  }
233
  $init_results = 1;
345
  $init_results = 1;
234
}
346
}
235
347
Lines 239-263 Link Here
239
  # don't open results files until we get here to avoid overwriting files
351
  # don't open results files until we get here to avoid overwriting files
240
  &init_results if !$init_results;
352
  &init_results if !$init_results;
241
353
242
  if ($class eq "s") {
354
  if ($opt_o) {
243
    if ($opt_o) { print STDOUT $result; } else { print SPAM $result; }
355
    print STDOUT $result;
244
    $spam_count++;
245
  }
356
  }
246
  elsif ($class eq "h") {
357
  elsif ($opt_spamlog || $opt_hamlog) {
247
    if ($opt_o) { print STDOUT $result; } else { print HAM $result; }
358
    if ($class eq "s") {
248
    $ham_count++;
359
      print SPAM $result;
360
    } else {
361
      print HAM $result;
362
    }
249
  }
363
  }
364
  else {
365
    print OUT $result;
366
  }
250
367
251
  $total_count++;
368
  $total_count++;
252
#warn ">> result: $total_count $class $time\n";
369
#warn ">> result: $total_count $class $time\n";
253
370
254
  if ($opt_progress) {
371
  if ($opt_progress) {
372
    if ($class eq "s") {
373
      $spam_count++;
374
    }
375
    else {
376
      $ham_count++;
377
    }
255
    progress($time);
378
    progress($time);
256
  }
379
  }
257
}
380
}
258
381
259
sub wanted {
382
sub wanted {
260
  my (undef, $id, $time, $dataref) = @_;
383
  my ($class, $id, $time, $dataref) = @_;
261
  my $out;
384
  my $out;
262
385
263
  my $ma = $spamtest->parse($dataref, 1);
386
  my $ma = $spamtest->parse($dataref, 1);
Lines 308-325 Link Here
308
    push(@extra, "mid=$mid");
431
    push(@extra, "mid=$mid");
309
  }
432
  }
310
433
311
  my $yorn;
434
  my $result;
312
  my $score;
435
  my $score;
313
  my $tests;
436
  my $tests;
314
  my $extra;
437
  my $extra;
315
438
316
  if ($opt_loguris) {
439
  if ($opt_loguris) {
317
    $yorn = '.';
440
    $result = '.';
318
    $score = 0;
441
    $score = 0;
319
    $tests = join(" ", sort @uris);
442
    $tests = join(" ", sort @uris);
320
    $extra = '';
443
    $extra = '';
321
  } else {
444
  } else {
322
    $yorn = $status->is_spam() ? 'Y' : '.';
445
    if ($status->is_spam()) {
446
      $result = "s";
447
    } else {
448
      $result = "h";
449
    }
323
    $score = $status->get_score();
450
    $score = $status->get_score();
324
    $tests = join(",", sort(grep(length,$status->get_names_of_tests_hit(),$status->get_names_of_subtests_hit())));
451
    $tests = join(",", sort(grep(length,$status->get_names_of_tests_hit(),$status->get_names_of_subtests_hit())));
325
    $extra = join(",", @extra);
452
    $extra = join(",", @extra);
Lines 333-339 Link Here
333
460
334
  $id =~ s/\s/_/g;
461
  $id =~ s/\s/_/g;
335
462
336
  $out .= sprintf("%s %2d %s %s %s\n", $yorn, $score, $id, $tests, $extra);
463
  $out .= sprintf("%s %s %05.2f %s %s %s\n", $class, $result, $score, $id, $tests, $extra);
337
464
338
  if ($tests =~ /MICROSOFT_EXECUTABLE|MIME_SUSPECT_NAME/) {
465
  if ($tests =~ /MICROSOFT_EXECUTABLE|MIME_SUSPECT_NAME/) {
339
    $out .= logkilled($ma, $id, "possible virus");
466
    $out .= logkilled($ma, $id, "possible virus");
(-)masses/mk-baseline-results (-2 / +2 lines)
Lines 10-16 Link Here
10
echo "Classification success on test corpora, at default threshold:"
10
echo "Classification success on test corpora, at default threshold:"
11
echo
11
echo
12
12
13
./logs-to-c --spam=spam-validate.log --nonspam=nonspam-validate.log --threshold 5 --count --scoreset=$SCORESET | sed -e 's/^Reading.*//' -e '/^$/d'
13
./fp-fn-statistics --logfile=masses-validate.log --threshold 5 --scoreset=$SCORESET
14
14
15
echo
15
echo
16
echo "Results on test corpora at various alternative thresholds:"
16
echo "Results on test corpora at various alternative thresholds:"
Lines 18-24 Link Here
18
18
19
# list a wide range of thresholds, so that we can make graphs later ;)
19
# list a wide range of thresholds, so that we can make graphs later ;)
20
for thresh in -4 -3 -2 -1 0 1 2 3 4 4.5 5.5 6 6.5 7 8 9 10 12 15 17 20 ; do
20
for thresh in -4 -3 -2 -1 0 1 2 3 4 4.5 5.5 6 6.5 7 8 9 10 12 15 17 20 ; do
21
  ./logs-to-c --spam=spam-validate.log --nonspam=nonspam-validate.log --threshold $thresh --count --scoreset=$SCORESET | sed -e 's/^Reading.*//' -e '/^$/d'
21
  ./fp-fn-statistics --logfile=masses-validate.log --threshold $thresh --scoreset=$SCORESET
22
  echo
22
  echo
23
done
23
done
24
24
(-)masses/README (-13 / +10 lines)
Lines 33-40 Link Here
33
33
34
See the CORPUS_POLICY file for more details.
34
See the CORPUS_POLICY file for more details.
35
35
36
37
38
HOW TO SUBMIT RESULTS BACK TO US
36
HOW TO SUBMIT RESULTS BACK TO US
39
--------------------------------
37
--------------------------------
40
38
Lines 52-62 Link Here
52
  This script is used to perform "mass checks" of a set of mailboxes, Cyrus
50
  This script is used to perform "mass checks" of a set of mailboxes, Cyrus
53
  folders, and/or MH mail spools.  It generates summary lines like this:
51
  folders, and/or MH mail spools.  It generates summary lines like this:
54
52
55
  Y  7 /home/jm/Mail/Sapm/1382 SUBJ_ALL_CAPS,SUPERLONG_LINE,SUBJ_FULL_OF_8BITS
53
  s s 07.22 /home/jm/Mail/Sapm/1382 SUBJ_ALL_CAPS,SUPERLONG_LINE,SUBJ_FULL_OF_8BITS
56
54
57
  or for mailboxes,
55
  or for mailboxes,
58
56
59
  .  1 /path/to/mbox:<5.1.0.14.2.20011004073932.05f4fd28@localhost> TRACKER_ID,BALANCE_FOR_LONG
57
  h h 01.32 /path/to/mbox:<5.1.0.14.2.20011004073932.05f4fd28@localhost> TRACKER_ID,BALANCE_FOR_LONG
60
58
61
  listing the path to the message or its message ID, its score, and the tests
59
  listing the path to the message or its message ID, its score, and the tests
62
  that triggered on that mail.
60
  that triggered on that mail.
Lines 65-87 Link Here
65
  get good hits with few false positives, etc., and re-score the tests to
63
  get good hits with few false positives, etc., and re-score the tests to
66
  optimise the ratio.
64
  optimise the ratio.
67
65
68
  This script relies on the spamassassin distribution directory living in "..".
66
  If given the --dist option, this script relies on the spamassassin
67
  distribution directory living in "..". If this script is not in the
68
  distribution directory, it will generate logs based on the site-wide
69
  rules, as well as personal rules.
69
70
70
71
logs-to-c :
71
logs-to-c :
72
72
73
  Takes the "spam.log" and "nonspam.log" files and converts them into C
73
  Takes the "masses.log" file and converts it into C source files
74
  source files and simplified data files for use by the C score optimization
74
  and simplified data files for use by the C score optimization
75
  algorithm.  (Called by "make" when you build the perceptron, so generally
75
  algorithm.  (Called by "make" when you build the perceptron, so
76
  you won't need to run it yourself.)
76
  generally you won't need to run it yourself.)
77
77
78
79
hit-frequencies :
78
hit-frequencies :
80
79
81
  Analyses the log files and computes how often each test hits, overall,
80
  Analyses the log files and computes how often each test hits, overall,
82
  for spam mails and for non-spam.
81
  for spam mails and for non-spam.
83
82
84
85
mk-baseline-results :
83
mk-baseline-results :
86
84
87
  Compute results for the baseline scores (read from ../rules/*).  If you
85
  Compute results for the baseline scores (read from ../rules/*).  If you
Lines 91-97 Link Here
91
  It will output statistics on the current ruleset to ../rules/STATISTICS.txt,
89
  It will output statistics on the current ruleset to ../rules/STATISTICS.txt,
92
  suitable for a release build of SpamAssassin.
90
  suitable for a release build of SpamAssassin.
93
91
94
95
perceptron.c :
92
perceptron.c :
96
93
97
  Perceptron learner by Henry Stern.  See "README.perceptron" for details.
94
  Perceptron learner by Henry Stern.  See "README.perceptron" for details.
(-)masses/fp-fn-statistics (-2 / +190 lines)
Lines 1-3 Link Here
1
#!/bin/sh
1
#!/usr/bin/perl -w
2
#
3
# <@LICENSE>
4
# Copyright 2004 Apache Software Foundation
5
#
6
# Licensed under the Apache License, Version 2.0 (the "License");
7
# you may not use this file except in compliance with the License.
8
# You may obtain a copy of the License at
9
#
10
#     http://www.apache.org/licenses/LICENSE-2.0
11
#
12
# Unless required by applicable law or agreed to in writing, software
13
# distributed under the License is distributed on an "AS IS" BASIS,
14
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
# See the License for the specific language governing permissions and
16
# limitations under the License.
17
# </@LICENSE>
2
18
3
exec ./logs-to-c --count $*
19
=head1 NAME
20
21
fp-fn-statistics - Display statistics about the quality of scores
22
23
=head1 SYNOPSIS
24
25
fp-fn-statistics [options]
26
27
  Options: 
28
    -c,--cffile=path	  Use path as the rules directory
29
    -s,--scoreset=n	  Use scoreset n
30
    -l,--logfile=file	  Read in file instead of masses.log
31
    -t,--threshold=n      Use a spam/ham threshold of n (default: 5)
32
    --lambda=n            Use a lambda value of n
33
34
=head1 DESCRIPTION
35
36
B<fp-fn-statistics> first calculates the score each message from a
37
masses.log would have under a new set of scores. It then aggregates
38
the number of messages correctly and incorrectly found as spam and
39
ham, and their average scores.
40
41
In addition, B<fp-fn-statistics> determines the "Total Cost Ratio" as
42
a result of the false positives and negatives mentioned above. This
43
calculation takes into account the value of lambda, which represents the cost
44
of recovering a false positive, where 1 indicates a message is tagged
45
only, 9 means the message is mailed back to sender asking for a token
46
(TMDA style) and 999 means a message is deleted. The default, 5,
47
represents the message being moved to an infrequently read folder.
48
49
=cut
50
51
use FindBin;
52
use lib "$FindBin::Bin/../lib";
53
use Mail::SpamAssassin::Masses;
54
use Getopt::Long qw(:config bundling auto_help);
55
use Pod::Usage;
56
use strict;
57
use warnings;
58
59
use vars qw{$opt_c $opt_l $opt_s $opt_t $opt_lambda};
60
61
GetOptions("c|cffile=s@" => \$opt_c,
62
	   "l|logfile=s" => \$opt_l,
63
	   "s|scoreset=i" => \$opt_s,
64
           "t|threshold=f" => \$opt_t,
65
           "lambda" => \$opt_lambda);
66
67
$opt_l ||= "masses.log";
68
69
if (!$opt_c || !scalar(@$opt_c)) {
70
    # Try to read this in from the log, if possible
71
    open IN, $opt_l or die "Can't open $opt_l: $!";
72
    my $files = 0; # are we in the files section?
73
    while(<IN>) {
74
	if (!$files) {
75
	    if (/^\# SVN revision:/) {
76
		$opt_c = [ "$FindBin::Bin/../rules" ];
77
		last;
78
	    } elsif (/^\# Using configuration:$/) {
79
		$files = 1;
80
	    }
81
	} elsif (/^\#\s+(.*)\s*$/) {
82
	    push (@$opt_c, $1);
83
	} else {
84
	    # All done!
85
	    last;
86
	}
87
    }
88
89
    if (!defined $opt_c) {
90
      $opt_c = [ "$FindBin::Bin/../rules" ];
91
    }
92
93
    foreach my $file (@$opt_c) {
94
	die "Can't read $file" unless -r $file;
95
    }
96
}
97
98
$opt_t = (defined($opt_t) ? $opt_t : 5);
99
$opt_s ||= 0;
100
$opt_lambda ||= 5;
101
102
my $nybias = 10;
103
104
105
my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
106
                                               scoreset => $opt_s, # ,,
107
                                               logfile => $opt_l});
108
109
$masses->readlogs();
110
111
my $logs = $masses->get_logs();
112
113
my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore) = (0,0,0,0,0,0,0,0);
114
115
my $num_spam = $masses->get_num_spam();
116
my $num_ham = $masses->get_num_ham();
117
my $num_logs = $num_spam + $num_ham;
118
119
my $count = 0;
120
121
my $score;
122
123
foreach my $log (@$logs) {
124
125
  $score = 0;
126
  foreach my $test (@{$log->{tests_hit}}) {
127
128
    next if ($test->{issubrule});
129
    next if (!$test->{score});
130
131
    $score += $test->{score};
132
133
  }
134
135
  if ($score >= $opt_t) {
136
    if ($log->{isspam}) {
137
      $ga_yy++;
138
      $yyscore += $score;
139
    }
140
    else {
141
      $ga_ny++;
142
      $nyscore += $score;
143
    }
144
  } else {
145
    if ($log->{isspam}) {
146
      $ga_yn++;
147
      $ynscore += $score;
148
    }
149
    else {
150
      $ga_nn++;
151
      $nnscore += $score;
152
    }
153
  }
154
}
155
156
$nybias = $nybias * ($num_spam / $num_ham);
157
158
my $fprate = ($ga_ny / $num_logs) * 100.0;
159
my $fnrate = ($ga_yn / $num_logs) * 100.0;
160
161
printf ("\n# SUMMARY for threshold %3.1f:\n", $opt_t);
162
printf "# Correctly non-spam: %6d  %4.2f%%  (%4.2f%% of non-spam corpus)\n", $ga_nn,
163
  ($ga_nn /  $num_logs) * 100.0, ($ga_nn /  $num_ham) * 100.0;
164
printf "# Correctly spam:     %6d  %4.2f%%  (%4.2f%% of spam corpus)\n" , $ga_yy,
165
  ($ga_yy /  $num_logs) * 100.0, ($ga_yy /  $num_spam) * 100.0;
166
printf "# False positives:    %6d  %4.2f%%  (%4.2f%% of nonspam, %6.0f weighted)\n", $ga_ny,
167
  $fprate, ($ga_ny /  $num_ham) * 100.0, $nyscore*$nybias;
168
printf "# False negatives:    %6d  %4.2f%%  (%4.2f%% of spam, %6.0f weighted)\n", $ga_yn,
169
  $fnrate, ($ga_yn /  $num_spam) * 100.0, $ynscore;
170
171
# convert to the TCR metrics used in the published lit
172
my $nspamspam = $ga_yy;
173
my $nspamlegit = $ga_yn;
174
my $nlegitspam = $ga_ny;
175
my $nlegitlegit = $ga_yn;
176
my $nlegit = $num_ham;
177
my $nspam = $num_spam;
178
179
my $werr = ($opt_lambda * $nlegitspam + $nspamlegit)
180
  / ($opt_lambda * $nlegit + $nspam);
181
182
my $werr_base = $nspam
183
  / ($opt_lambda * $nlegit + $nspam);
184
185
$werr ||= 0.000001;     # avoid / by 0
186
my $tcr = $werr_base / $werr;
187
188
my $sr = ($nspamspam / $nspam) * 100.0;
189
my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;
190
printf "# TCR: %3.6f  SpamRecall: %3.3f%%  SpamPrec: %3.3f%%  FP: %3.2f%%  FN: %3.2f%%\n", $tcr, $sr, $sp, $fprate, $fnrate;
191
(-)masses/extract-message-from-mbox (-17 / +48 lines)
Lines 19-48 Link Here
19
use bytes;
19
use bytes;
20
20
21
use vars qw {
21
use vars qw {
22
  $opt_f $opt_h $opt_m $opt_H
22
  $opt_h $opt_m
23
};
23
};
24
24
25
use Getopt::Std;
26
getopts("f:hmH");
27
25
28
sub usage {
26
use Getopt::Long qw(:config bundling auto_help);
29
  die "extract-message-from-mbox [-f=file] [-m] [-H] offset
27
use Pod::Usage;
30
28
31
  Extracts the message starting at offset from file (or stdin). Very
29
GetOptions("m|mass-check" => \$opt_m, "h|H|headers" => \$opt_h);
32
  useful in combination with mass-check logs and mboxes. If the -m
33
  option is used, the input should be in \"mass-check\" format (as
34
  output by mass-check). Use the -H option to just output headers.
35
";
36
}
37
30
38
usage() if($opt_h || (!defined($ARGV[0]) && !$opt_m));
31
=head1 NAME
39
my $offset = $ARGV[0];
40
32
33
extract-message-from-mbox - Extract a message from an mbox
34
35
=head1 SYNOPSIS
36
37
 extract-message-from-mbox [--headers] <mbox>.<offset>
38
 extract-message-from-mbox --mass-check
39
40
 Options:
41
  -h, --headers       Display only message headers
42
  -m, --mass-check    Read mass-check output from stdin
43
44
=head1 DESCRIPTION
45
46
B<extract-message-from-mbox> extracts the message from I<mbox>
47
starting at the byte offset I<offset>. Very useful in combination with
48
mass-check logs and mboxes. If the -m or --mass-check option is used,
49
the input should be in "mass-check" format (as output by
50
mass-check). Use the -H option to just output headers.
51
52
=head1 EXAMPLES
53
54
To show messages that hit the rule BAYES_99
55
56
grep BAYES_99 masses.log | extract-message-from-mbox -m
57
58
To show the message indicated by "/path/to/my/mbox.1234"
59
60
extract-message-from-mbox /path/to/my/mbox.1234
61
62
=cut
63
64
65
41
if($opt_m) {
66
if($opt_m) {
42
  masscheck();
67
  masscheck();
43
} else {
68
} else {
44
  $opt_f ||= '&STDIN';
69
  foreach my $message (@ARGV) {
45
  extract($opt_f, $offset);
70
    if ($message =~ /^(.*?)(?:\.(\d+))?$/) {
71
      extract($1, ($2 || 0));
72
    }
73
    else {
74
      pod2usage("Argument must be of the form <mbox>.<offset>");
75
    }
76
  }
46
}
77
}
47
78
48
sub extract {
79
sub extract {
Lines 61-74 Link Here
61
      $found++ if(/^From /);
92
      $found++ if(/^From /);
62
      last if($found == 3);
93
      last if($found == 3);
63
      print;
94
      print;
64
      last if ($opt_H && /^$/) # empty line? end of headers
95
      last if ($opt_h && /^$/) # empty line? end of headers
65
    }
96
    }
66
  }
97
  }
67
}
98
}
68
99
69
sub masscheck {
100
sub masscheck {
70
  while (<STDIN>) {
101
  while (<STDIN>) {
71
    my $mail = (split(/\s+/, $_))[2];
102
    my $mail = (split(/\s+/, $_))[3];
72
    $mail =~ tr/_/ /;
103
    $mail =~ tr/_/ /;
73
    if ($mail =~ /^(.*)\.(\d+)$/) {
104
    if ($mail =~ /^(.*)\.(\d+)$/) {
74
      extract($1, $2);
105
      extract($1, $2);
(-)masses/logs-to-c (-346 / +219 lines)
Lines 16-272 Link Here
16
# limitations under the License.
16
# limitations under the License.
17
# </@LICENSE>
17
# </@LICENSE>
18
18
19
use Getopt::Long;
19
=head1 NAME
20
use vars qw($opt_cffile $opt_count $opt_lambda $opt_threshold
21
		$opt_spam $opt_nonspam);
22
20
23
GetOptions("cffile=s", "count", "lambda=f", "threshold=f", "spam=s", "nonspam=s", "scoreset=i");
21
logs-to-c - Convert a mass-check log into perceptron format
24
my $argcffile = $opt_cffile;
25
22
26
my $justcount = 0;
23
=head1 SYNOPSIS
27
if ($opt_count) { $justcount = 1; }
28
24
29
my $threshold = 5;
25
logs-to-c [options]
30
if (defined $opt_threshold) { $threshold = $opt_threshold; }
31
26
32
$opt_spam ||= 'spam.log';
27
 Options:
33
$opt_nonspam ||= 'ham.log';
28
    -c,--cffile=path	  Use path as the rules directory
34
$opt_scoreset = 0 if ( !defined $opt_scoreset );
29
    -s,--scoreset=n	  Use scoreset n
30
    -l,--logfile=file	  Read in file instead of masses.log
31
    -o,--output=dir       Put output in the specified dir (default tmp/)
35
32
36
my $nybias = 10;
33
=head1 DESCRIPTION
37
34
38
# lambda value for TCR equation, indicating the "cost" of recovering
35
B<logs-to-c> will read the mass-check log F<masses.log> or as
39
# from an FP.  The values are: 1 = tagged only, 9 = mailed back to
36
specified by the B<--logfile> option, and convert it into the format
40
# sender asking for token (TMDA style), 999 = deleted outright.
37
needed by the perceptron. This is a format that is simple for the
41
# We (SpamAssassin) use a default of 5, representing "moved to
38
perceptron to parse, but is not very readable to humans.
42
# infrequently-read folder".
43
39
44
my $lambda = 5;
40
By default, output will be put in the directory ./tmp/ unless another
45
if ($opt_lambda) { $lambda = $opt_lambda; }
41
directory is specified by the B<--output> option. (Note: at the
42
current time, this must be /tmp/ in order for the perceptron to
43
compile properly.)
46
44
47
my %is_spam = ();
45
=head1 BUGS
48
my %tests_hit = ();
49
my %mutable_tests = ();
50
46
51
use vars qw(%rules %allrules);
47
Please report bugs to http://bugzilla.spamassassin.org/
52
48
53
readscores();
49
=head1 SEE ALSO
54
50
55
print "Reading per-message hit stat logs and scores...\n";
51
L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)>
56
my ($num_tests, $num_spam, $num_nonspam);
57
my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore);
58
52
59
readlogs();
53
=cut
60
read_ranges();
61
54
62
if ($justcount) {
55
use FindBin;
63
  $nybias = $nybias*($num_spam / $num_nonspam);
56
use lib "$FindBin::Bin/../lib";
64
  evaluate();
57
use Mail::SpamAssassin::Masses;
65
} else {
58
use Getopt::Long qw(:config bundling auto_help);
66
  print "Writing logs and current scores as C code...\n";
59
use Pod::Usage;
67
  writescores_c();
60
use strict;
68
}
61
use warnings;
69
exit 0;
70
62
63
use vars qw{$opt_c $opt_l $opt_s $opt_o};
71
64
72
sub readlogs {
65
GetOptions("c|cffile=s@" => \$opt_c,
73
  my $count = 0;
66
	   "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
74
  $num_spam = $num_nonspam = 0;
67
	   "l|logfile=s" => \$opt_l,
68
	   "o|output=s" => \$opt_o);
75
69
76
  if ($justcount) {
77
    $ga_yy = $ga_ny = $ga_yn = $ga_nn = 0;
78
    $yyscore = $ynscore = $nyscore = $nnscore = 0.0;
79
  }
80
70
81
  foreach my $file ($opt_spam, $opt_nonspam) {
71
$opt_o ||= "./tmp/";
82
    open (IN, "<$file");
72
if (!-d $opt_o) {
73
  mkdir $opt_o, 0777 or die "Can't mkdir $opt_o";
74
}
83
75
84
    while (<IN>) {
76
$opt_l ||= "masses.log";
85
      next if /^\#/;
86
      next if /^$/;
87
      if($_ !~ /^.\s+([-\d]+)\s+\S+\s*/) { warn "bad line: $_"; next; }
88
      my $hits = $1;
89
#my $foo = $_;
90
      $_ = $'; s/(?:bayes|time)=\S+//; s/,,+/,/g; s/^\s+//; s/\s+$//;
91
77
92
      my $score = 0;
78
if (!$opt_c || !scalar(@$opt_c)) {
93
      my @tests = ();
79
    # Try to read this in from the log, if possible
94
      foreach my $tst (split (/,/, $_)) {
80
    open IN, $opt_l or die "Can't open $opt_l: $!";
95
	next if ($tst eq '');
81
    my $files = 0; # are we in the files section?
96
	if (!defined $scores{$tst}) {
82
    while(<IN>) {
97
          #warn "unknown test in $file, ignored: $tst\n";
83
	if (!$files) {
98
	  next;
84
	    if (/^\# SVN revision:/) {
85
		$opt_c = [ "$FindBin::Bin/../rules" ];
86
		last;
87
	    } elsif (/^\# Using configuration:$/) {
88
		$files = 1;
89
	    }
90
	} elsif (/^\#\s+(.*)\s*$/) {
91
	    push (@$opt_c, $1);
92
	} else {
93
	    # All done!
94
	    last;
99
	}
95
	}
96
    }
100
97
101
	# Make sure to skip any subrules!
98
    if (!defined $opt_c) {
102
	next if ( $allrules{$tst}->{issubrule} );
99
      $opt_c = [ "$FindBin::Bin/../rules" ];
100
    }
103
101
104
        if ($justcount) {
102
    foreach my $file (@$opt_c) {
105
          $score += $scores{$tst};
103
	die "Can't read $file" unless -r $file;
106
        } else {
107
          push (@tests, $tst);
108
        }
109
      }
110
111
      if (!$justcount) { 
112
        $tests_hit{$count} = \@tests;
113
      }
114
115
      if ($file eq $opt_spam) {
116
	$num_spam++;
117
        if ($justcount) {
118
          if ($score >= $threshold) {
119
            $ga_yy++; $yyscore += $score;
120
          } else {
121
            $ga_yn++; $ynscore += $score;
122
          }
123
        } else {
124
          $is_spam{$count} = 1;
125
        }
126
      } else {
127
	$num_nonspam++;
128
        if ($justcount) {
129
          if ($score >= $threshold) {
130
#print "$score -- $foo";
131
            $ga_ny++; $nyscore += $score;
132
          } else {
133
            $ga_nn++; $nnscore += $score;
134
          }
135
        } else {
136
          $is_spam{$count} = 0;
137
        }
138
      }
139
      $count++;
140
    }
104
    }
141
    close IN;
142
  }
143
  $num_tests = $count;
144
}
105
}
145
106
107
# ignore rules that are subrules -- we don't generate scores for them...
146
108
147
sub readscores {
109
# Note: this will cause a difference over the old logs-to-c since rank
148
  if (!defined $argcffile) { $argcffile = "../rules"; }
110
# is dependent on the frequencies of all rules, not just non-subrules
149
  print "Reading scores from \"$argcffile\"...\n";
150
  system ("./parse-rules-for-masses -d \"$argcffile\" -s $opt_scoreset") and die;
151
  require "./tmp/rules.pl";
152
  %allrules = %rules;           # ensure it stays global
153
}
154
111
112
my $greprules = sub { return 0 if $_[1]->{issubrule}; return 1; };
113
114
$opt_s ||= 0; # |
115
116
my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
117
					       scoreset => $opt_s, # ,,
118
					       logfile => $opt_l,
119
                                               greprules => $greprules });
120
121
$masses->readlogs();
122
$masses->do_score_ranges();
123
124
my $rules = $masses->get_rules_array();
125
my $logs = $masses->get_logs();
126
127
my @index_to_rule;
128
my $num_spam = $masses->get_num_spam();
129
my $num_ham = $masses->get_num_ham();
130
131
# This is misleading -- num_tests is really num_msgs
132
my $num_tests = $num_spam + $num_ham;
133
134
135
# Write logs and scores as C code
136
writescores_c();
137
writetests_c();
138
139
155
sub writescores_c {
140
sub writescores_c {
156
  my $output = '';
141
157
  my $size = 0;
158
  my $mutable = 0;
142
  my $mutable = 0;
159
  my $i;
143
  my $output = '';
144
  my $count = 0;
145
  my $score = 0;
160
146
161
    # jm: now, score-ranges-from-freqs has tflags to work from, so
147
  foreach my $rule (sort {($b->{ismutable} <=> $a->{ismutable}) ||
162
    # it will always list all mutable tests.
148
			  ($a->{name} cmp $b->{name}) } @$rules) {
163
149
164
  @index_to_rule = sort {($ignored_rule{$a} <=> $ignored_rule{$b}) ||
150
    $score = $rule->{score};
165
			  ($mutable_tests{$b} <=> $mutable_tests{$a}) ||
166
			   ($a cmp $b)} (keys %scores);
167
  my $max_hits_per_msg = 0;
168
  for ($file = 0; $file < $num_tests; $file++) {
169
    my(@hits) =
170
     grep {(! $ignored_rule{$_}) && $mutable_tests{$_}} (@{$tests_hit{$file}});
171
    if ((scalar(@hits)+1) > $max_hits_per_msg) {
172
      $max_hits_per_msg = scalar(@hits)+1;
173
    }
174
  }
175
151
176
  for ($i = 0; $i <= $#index_to_rule; $i++) {
152
    # ignored rules (i.e. no scores)
177
    my $name = $index_to_rule[$i];
153
    next unless $score;
178
    $rule_to_index{$name} = $i;
179
154
180
    if ($ignored_rule{$name}) { next; }
155
    # also ignore rules with score range 0
156
    next if (!$rule->{range_lo} && !$rule->{range_hi});
181
157
182
    if ($mutable_tests{$name} == 0) {
158
    # Set an index
183
      $range_lo{$name} = $range_hi{$name} = $scores{$name};
159
    $rule->{index} = $count;
184
    } else {
160
    $index_to_rule[$count] = $rule; # add the reference to the array
161
162
    if ($rule->{ismutable}) {
185
      $mutable++;
163
      $mutable++;
186
      if ($range_lo{$name} > $range_hi{$name}) {
164
      if ($score > $rule->{range_hi}) {
187
	($range_lo{$name},$range_hi{$name}) =
165
	$score = $rule->{range_hi} - 0.001;
188
	 ($range_hi{$name},$range_lo{$name});
166
      } elsif ($score < $rule->{range_lo}) {
167
	$score = $rule->{range_lo} + 0.001;
189
      }
168
      }
190
      #$range_lo{$name} ||= 0.1;
191
      #$range_hi{$name} ||= 1.5;
192
    }
169
    }
170
    # These should all be set properly if not mutable
171
    # score = range_lo = range_hi
172
    else {
173
      warn "hi != lo for " . $rule->{name} . "!" if $rule->{range_lo} != $rule->{range_hi};
174
      $score = $rule->{range_hi} = $rule->{range_lo};
175
    }
193
176
194
    $output .= ".".$i."\n".
177
    $output .= "." . $count . "\n" .
195
                "n".$name."\n".
178
         "n" . $rule->{name} . "\n" .
196
                "b".$scores{$name}."\n".
179
	 "b" . $score . "\n" .
197
                "m".$mutable_tests{$name}."\n".
180
	 "m" . $rule->{ismutable} . "\n" .
198
                "l".$range_lo{$name}."\n".
181
	 "l" . $rule->{range_lo} . "\n" .
199
                "h".$range_hi{$name}."\n";
182
	 "h" . $rule->{range_hi} . "\n";
200
    $size++;
183
184
    $count++;
185
201
  }
186
  }
202
187
188
  # Output this
203
189
204
  open (DAT, ">tmp/scores.data");
190
  open (DAT, ">$opt_o/scores.data");
205
  print DAT "N$size\n", "M$mutable\n", # informational only
191
  print DAT "N$count\n", "M$mutable\n"; # informational
206
   $output;
192
  print DAT $output;
207
  close DAT;
193
  close DAT;
208
194
209
  open (OUT, ">tmp/scores.h");
195
  open (OUT, ">$opt_o/scores.h");
210
  print OUT "
196
  print OUT <<EOF;
211
#include <stdio.h>
197
#include <stdio.h>
212
#include <string.h>
198
#include <string.h>
213
#include <stdlib.h>
199
#include <stdlib.h>
214
200
 
215
int num_scores = $size;
201
int num_scores = $count;
216
int num_mutable = $mutable;
202
int num_mutable = $mutable;
217
unsigned char is_mutable[$size];
203
unsigned char is_mutable[$count];
218
double range_lo[$size];
204
double range_lo[$count];
219
double range_hi[$size];
205
double range_hi[$count];
220
double bestscores[$size];
206
double bestscores[$count];
221
char *score_names[$size];
207
char *score_names[$count];
222
double tmp_scores[$size][2];
208
double tmp_scores[$count][2];
223
unsigned char ny_hit[$mutable];
209
unsigned char ny_hit[$mutable];
224
unsigned char yn_hit[$mutable];
210
unsigned char yn_hit[$mutable];
225
211
 
226
double lookup[$mutable];
212
double lookup[$mutable];
227
213
 
228
/* readscores() is defined in tests.h */
214
/* readscores() is defined in tests.h */
215
EOF
229
216
230
";
231
  close OUT;
217
  close OUT;
232
218
233
  writetests_c($max_hits_per_msg); # make sure $rule_to_index is around
234
}
219
}
235
220
221
236
sub writetests_c {
222
sub writetests_c {
237
  my $max_hits_per_msg = $_[0];
238
223
239
  my(%uniq_files) = ();
224
  my $max_hits_per_msg = 0;
240
  my(%count_keys) = ();
225
  my @goodtests;
241
  my(%file_key) = ();
226
  my %uniq_logs;
227
  my $uniq_key;
242
228
243
  my $file;
229
  my $i = 0;
244
230
245
  for ($file = 0; $file < $num_tests; $file++)
231
  # This will "compress" the logs so that one log entry can have a
246
  {
232
  # "count" of n indicating it represents n similar messages
247
    my $uniq_key = $is_spam{$file} . " ";
248
233
249
    my(@good_tests) =
234
  foreach my $log (@$logs) {
250
     grep {length($_) && (! $ignored_rule{$_}) &&
251
	    (defined($rule_to_index{$_}))} (@{ $tests_hit{$file} });
252
235
253
    @good_tests = sort {$a <=> $b} (map {$rule_to_index{$_}} (@good_tests));
236
    (@goodtests) = grep {exists($_->{index})} (@{$log->{tests_hit}});
237
    @goodtests = sort {$a <=> $b} map {$_->{index}} @goodtests;
254
238
255
    $uniq_key .= join(" ",@good_tests);
239
    if($max_hits_per_msg < scalar(@goodtests)) {
240
      $max_hits_per_msg = scalar(@goodtests);
241
    }
256
242
257
    if (exists($count_keys{$uniq_key})) {
243
    $uniq_key = $log->{isspam} ? "s" : "";
258
      $count_keys{$uniq_key}++;
244
    $uniq_key .= join(" ", @goodtests);
245
246
247
    # The %uniq_logs hash's entries will be the log info for each unique log
248
    # $log->{count} is increased to indicate similar logs
249
250
    if (exists($uniq_logs{$uniq_key})) {
251
      $uniq_logs{$uniq_key}->{count}++;
259
    } else {
252
    } else {
260
      $count_keys{$uniq_key} = 1;
253
      $uniq_logs{$uniq_key} = $log;
261
      $file_key{$file} = $uniq_key;
254
      $uniq_logs{$uniq_key}->{count} = 1;
262
      $uniq_files{$file} = scalar(keys(%count_keys)) - 1;
263
    }
255
    }
256
264
  }
257
  }
265
258
266
  my $num_nondup = scalar(keys(%uniq_files));
259
  my $num_nondup = scalar(keys %uniq_logs);
267
260
268
  open (TOP, ">tmp/tests.h");
261
  open TOP, ">$opt_o/tests.h";
269
  print TOP "
262
  print TOP <<EOF;
270
#include <stdio.h>
263
#include <stdio.h>
271
#include <string.h>
264
#include <string.h>
272
#include <stdlib.h>
265
#include <stdlib.h>
Lines 274-280 Link Here
274
int num_tests = $num_tests;
267
int num_tests = $num_tests;
275
int num_nondup = $num_nondup;
268
int num_nondup = $num_nondup;
276
int num_spam = $num_spam;
269
int num_spam = $num_spam;
277
int num_nonspam = $num_nonspam;
270
int num_nonspam = $num_ham;
278
int max_hits_per_msg = $max_hits_per_msg;
271
int max_hits_per_msg = $max_hits_per_msg;
279
unsigned char num_tests_hit[$num_nondup];
272
unsigned char num_tests_hit[$num_nondup];
280
unsigned char is_spam[$num_nondup];
273
unsigned char is_spam[$num_nondup];
Lines 282-477 Link Here
282
double scores[$num_nondup];
275
double scores[$num_nondup];
283
double tmp_total[$num_nondup];
276
double tmp_total[$num_nondup];
284
int tests_count[$num_nondup];
277
int tests_count[$num_nondup];
278
EOF
285
279
286
";
280
287
  $_ = join ('', <DATA>);
281
  print TOP join('', <DATA>);
288
  print TOP $_;
289
  close TOP;
282
  close TOP;
290
283
291
  open (DAT, ">tmp/tests.data");
292
284
293
  foreach $file (sort {$a <=> $b} (keys %uniq_files)) {
285
  open (DAT, ">$opt_o/tests.data");
294
    print DAT ".".$uniq_files{$file}."\n";
295
286
296
    my $out = '';
287
  my $out;
297
    $out .= "s".$is_spam{$file}."\n";
288
  my $base_score;
289
  my $num_tests_hit;
298
290
299
    my $base_score = 0;
291
  $i = 0;
300
    my $num_tests_hit = 0;
292
  foreach my $log (values %uniq_logs) {
301
    foreach my $test (@{$tests_hit{$file}}) {
293
    $out = '';
302
      if ($test eq '') { next; }
294
    $base_score = $num_tests_hit = 0;
303
295
304
      if ($ignored_rule{$test}) {
296
    print DAT "." . $i . "\n";
305
        warn "ignored rule $test got a hit in $file!\n";
297
306
        next;
298
    $out .= "s" . ( ($log->{isspam})? 1 : 0 ) . "\n";
299
300
    foreach my $test (@{$log->{tests_hit}}) {
301
      if (!$test->{score}) {
302
	# Don't really know why this happens, but the old logs-to-c
303
	#did it too
304
305
	warn "ignored rule " . $test->{name} . " got a hit!";
306
	next;
307
      }
307
      }
308
308
309
      if (!defined $rule_to_index{$test}) {
309
      if (!$test->{range_lo} && !$test->{range_hi}) {
310
	warn "test with no C index: $test\n";
310
	# We ignored this rule
311
	next;
311
	next;
312
      }
312
      }
313
313
314
      if ($mutable_tests{$test}) {
314
      # debugging...
315
      $num_tests_hit++;
315
      if (!defined $test->{index}) {
316
      $out .= "t".$rule_to_index{$test}."\n";
316
	warn "test with no index";
317
318
      if ($num_tests_hit >= $max_hits_per_msg) {
319
	die "Need to increase \$max_hits_per_msg";
320
      }
317
      }
321
      } else {
322
	$base_score += $scores{$test};
323
      }
324
    }
325
318
326
    $out .= "b" . $base_score . "\n"; # score to add in for non-mutable tests
319
      if ($test->{ismutable}) {
327
    $out .= "c" . $count_keys{$file_key{$file}} . "\n";
320
	$num_tests_hit++;
321
	$out .= "t".$test->{index}."\n";
328
322
329
    print DAT "n".$num_tests_hit."\n".$out;
323
	if ($num_tests_hit >= $max_hits_per_msg) {
330
  }
324
	  die "\$max_hits_per_msg not big enough!";
331
  close DAT;
325
	}
332
}
333
326
334
sub read_ranges {
327
      }
335
  if (!-f 'tmp/ranges.data') {
328
      else {
336
    system ("make tmp/ranges.data");
329
	$base_score += $test->{score};
337
  }
330
      }
338
331
339
  # read ranges, and mutableness, from ranges.data.
340
  open (IN, "<tmp/ranges.data")
341
  	or die "need to run score-ranges-from-freqs first!";
342
343
  my $count = 0;
344
  while (<IN>) {
345
    /^(\S+) (\S+) (\d+) (\S+)$/ or next;
346
    my $t = $4;
347
    $range_lo{$t} = $1+0;
348
    $range_hi{$t} = $2+0;
349
    my $mut = $3+0;
350
351
    if ($allrules{$t}->{issubrule}) {
352
      $ignored_rule{$t} = 1;
353
      $mutable_tests{$t} = 0;
354
      next;
355
    }
332
    }
356
    if (($range_lo{$t} == $range_hi{$t}) && (! $range_lo{$t})) {
357
      #warn "ignored rule: score and range == 0: $t\n";
358
      $ignored_rule{$t} = 1;
359
      $mutable_tests{$t} = 0;
360
      next;
361
    }
362
333
363
    $ignored_rule{$t} = 0;
334
    $out .= "b" . $base_score . "\n"; # score to add for non-mutable tests
364
    $index_to_rule[$count] = $t;
335
    $out .= "c" . $log->{count} . "\n"; # number of identical logs
365
    $count++;
366
336
367
    if (!$mut) {
337
    print DAT "n" . $num_tests_hit . "\n" . $out;
368
      $mutable_tests{$t} = 0;
369
    } elsif ($range_lo{$t} == $range_hi{$t}) {
370
      $mutable_tests{$t} = 0;
371
    } elsif ($allrules{$t}->{tflags} =~ m/\buserconf\b/i) {
372
      $mutable_tests{$t} = 0;
373
    } else {
374
      $mutable_tests{$t} = 1;
375
    }
376
    unless ($mutable_tests{$t} || $scores{$t}) {
377
      $ignored_rule{$t} = 1;
378
    }
379
  }
380
  close IN;
381
338
382
  # catch up on the ones missed; seems to be userconf or 0-hitters mostly.
339
    $i++;
383
  foreach my $t (sort keys %allrules) {
384
    next if (exists($range_lo{$t}));
385
    if ($allrules{$t}->{issubrule}) {
386
      $ignored_rule{$t} = 1;
387
      $mutable_tests{$t} = 0;
388
      next;
389
    }
390
    $ignored_rule{$t} = 0;
391
    unless (exists($mutable_tests{$t}) &&
392
	    ($allrules{$t}->{tflags} !~ m/\buserconf\b/i)) {
393
      $mutable_tests{$t} = 0;
394
    }
395
    unless ($mutable_tests{$t} || $scores{$t}) {
396
      $ignored_rule{$t} = 1;
397
    }
398
    $index_to_rule[$count] = $t;
399
    $count++;
400
  }
340
  }
401
  foreach my $t (keys %range_lo) {
402
    next if ($ignored_rule{$t});
403
    if ($mutable_tests{$t}) {
404
      if (($scores{$t} == 1) && ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
405
	$scores{$t} = -1;
406
      } elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/) &&
407
	       ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
408
	$scores{$t} = -0.01;
409
      }
410
      if ($scores{$t} >= $range_hi{$t}) {
411
	$scores{$t} = $range_hi{$t} - 0.001;
412
      } elsif ($scores{$t} <= $range_lo{$t}) {
413
	$scores{$t} = $range_lo{$t} + 0.001;
414
      }
415
    } else {
416
      if ($allrules{$t}->{tflags} =~ m/\buserconf\b/i) {
417
	next;
418
      } elsif ($range_lo{$t} == $range_hi{$t}) {
419
	$scores{$t} = $range_lo{$t};
420
	next;
421
      }
422
      if (($scores{$t} == 1) && ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
423
	$scores{$t} = -1;
424
      } elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/) &&
425
	       ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
426
	$scores{$t} = -0.01;
427
      }
428
      if ($scores{$t} > $range_hi{$t}) {
429
	$scores{$t} = $range_hi{$t};
430
      } elsif ($scores{$t} < $range_lo{$t}) {
431
	$scores{$t} = $range_lo{$t};
432
      }
433
    }
434
  }
435
}
436
341
437
sub evaluate {
342
  close DAT;
438
   my $fprate = ($ga_ny / $num_tests) * 100.0;
439
   my $fnrate = ($ga_yn / $num_tests) * 100.0;
440
343
441
   printf ("\n# SUMMARY for threshold %3.1f:\n", $threshold);
442
   printf "# Correctly non-spam: %6d  %4.2f%%  (%4.2f%% of non-spam corpus)\n", $ga_nn,
443
       ($ga_nn /  $num_tests) * 100.0, ($ga_nn /  $num_nonspam) * 100.0;
444
   printf "# Correctly spam:     %6d  %4.2f%%  (%4.2f%% of spam corpus)\n" , $ga_yy,
445
       ($ga_yy /  $num_tests) * 100.0, ($ga_yy /  $num_spam) * 100.0;
446
   printf "# False positives:    %6d  %4.2f%%  (%4.2f%% of nonspam, %6.0f weighted)\n", $ga_ny,
447
       $fprate, ($ga_ny /  $num_nonspam) * 100.0, $nyscore*$nybias;
448
   printf "# False negatives:    %6d  %4.2f%%  (%4.2f%% of spam, %6.0f weighted)\n", $ga_yn,
449
       $fnrate, ($ga_yn /  $num_spam) * 100.0, $ynscore;
450
344
451
  # convert to the TCR metrics used in the published lit
452
  my $nspamspam = $ga_yy;
453
  my $nspamlegit = $ga_yn;
454
  my $nlegitspam = $ga_ny;
455
  my $nlegitlegit = $ga_yn;
456
  my $nlegit = $num_nonspam;
457
  my $nspam = $num_spam;
458
459
  my $werr = ($lambda * $nlegitspam + $nspamlegit)
460
                  / ($lambda * $nlegit + $nspam);
461
462
  my $werr_base = $nspam
463
                  / ($lambda * $nlegit + $nspam);
464
465
  $werr ||= 0.000001;     # avoid / by 0
466
  my $tcr = $werr_base / $werr;
467
468
  my $sr = ($nspamspam / $nspam) * 100.0;
469
  my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;
470
  printf "# TCR: %3.6f  SpamRecall: %3.3f%%  SpamPrec: %3.3f%%  FP: %3.2f%%  FN: %3.2f%%\n", $tcr, $sr, $sp, $fprate, $fnrate;
471
}
345
}
472
346
473
__DATA__
474
347
348
__DATA__
475
void loadtests (void) {
349
void loadtests (void) {
476
  FILE *fin = fopen ("tmp/tests.data", "r");
350
  FILE *fin = fopen ("tmp/tests.data", "r");
477
  char buf[256];
351
  char buf[256];
Lines 557-560 Link Here
557
431
558
  printf ("Read scores for %d tests.\n", num_scores);
432
  printf ("Read scores for %d tests.\n", num_scores);
559
}
433
}
560
(-)masses/post-ga-analysis.pl (-27 / +11 lines)
Lines 7-15 Link Here
7
my %scores;
7
my %scores;
8
my %rulehit;
8
my %rulehit;
9
9
10
open(SPAM, "<spam.log");
10
open(LOGS, "<masses.log");
11
open(NONSPAM, "<nonspam.log");
11
open(SCORES, "<perceptron.scores");
12
open(SCORES, "<newscores");
13
12
14
while(<SCORES>)
13
while(<SCORES>)
15
{
14
{
Lines 22-32 Link Here
22
21
23
close(SCORES);
22
close(SCORES);
24
23
25
while(<SPAM>)
24
while(<LOGS>)
26
{
25
{
27
    next if /^#/;
26
    next if /^#/;
28
    /.\s+[-0-9]*\s+[^\s]+\s+([^\s]*)(\s+?:(?:bayes|time)=\S+)\s*?$/;
27
    /(.)\s+.\s+[-0-9]*\s+[^\s]+\s+([^\s]*)(\s+?:(?:bayes|time)=\S+)\s*?$/;
29
    my @rules=split /,/,$1;
28
    my $class = $1;
29
    my @rules=split /,/,$2;
30
    my $score = 0.0;
30
    my $score = 0.0;
31
    foreach $rule (@rules)
31
    foreach $rule (@rules)
32
    {
32
    {
Lines 35-41 Link Here
35
	$rulehit{$rule}++;
35
	$rulehit{$rule}++;
36
    }
36
    }
37
37
38
    if($score < 5)
38
    if($class eq "s" && $score < 5)
39
    {
39
    {
40
	foreach $rule (@rules)
40
	foreach $rule (@rules)
41
	{
41
	{
Lines 44-70 Link Here
44
	}
44
	}
45
	$nfn++;
45
	$nfn++;
46
    }
46
    }
47
}
47
    if($class eq "h" && score >= 5)
48
49
close(SPAM);
50
51
while(<NONSPAM>)
52
{
53
    next if /^#/;
54
    /.\s+[-0-9]*\s+[^\s]+\s+([^\s]*)\s*$/;
55
    next unless defined($1);
56
57
    my @rules=split /,/,$1;
58
    my $score = 0.0;
59
    foreach $rule (@rules)
60
    {
48
    {
61
        next unless (defined ($scores{$rule}));
62
	$score += $scores{$rule};
63
	$rulehit{$rule}++;
64
    }
65
66
    if($score >= 5)
67
    {
68
	foreach $rule (@rules)
49
	foreach $rule (@rules)
69
	{
50
	{
70
            next unless (defined ($scores{$rule}));
51
            next unless (defined ($scores{$rule}));
Lines 72-79 Link Here
72
	}
53
	}
73
	$nfp++;
54
	$nfp++;
74
    }
55
    }
56
75
}
57
}
76
58
59
close(LOGS);
60
77
@fpk = sort { $falsepos{$b}/($rulehit{$b}||0.0001) <=> $falsepos{$a}/($rulehit{$a}||0.00001) } keys %falsepos;
61
@fpk = sort { $falsepos{$b}/($rulehit{$b}||0.0001) <=> $falsepos{$a}/($rulehit{$a}||0.00001) } keys %falsepos;
78
62
79
print "COMMON FALSE POSITIVES: ($nfp total)\n-----------------------\n\n";
63
print "COMMON FALSE POSITIVES: ($nfp total)\n-----------------------\n\n";
(-)masses/convert-old-logs-to-new (+15 lines)
Line 0 Link Here
1
#!/bin/sh -e
2
3
cat spam.log | perl -ne's/^Y/s s/; s/^\./s h/; print unless /^\#/;' \
4
  > spam.log.sorted
5
6
cat ham.log | perl -ne's/^Y/h s/; s/^\./h h/; print unless /^\#/;' \
7
  > ham.log.sorted
8
9
# sort by time
10
11
echo \# SVN revision: > masses.log
12
13
sort --field-separator='=' -n -k2,2 --merge spam.log.sorted ham.log.sorted \
14
  >> masses.log
15
0
  + *
16
  + *
(-)masses/score-ranges-from-freqs (-251 lines)
Lines 1-251 Link Here
1
#!/usr/bin/perl -w
2
#
3
# <@LICENSE>
4
# Copyright 2004 Apache Software Foundation
5
# 
6
# Licensed under the Apache License, Version 2.0 (the "License");
7
# you may not use this file except in compliance with the License.
8
# You may obtain a copy of the License at
9
# 
10
#     http://www.apache.org/licenses/LICENSE-2.0
11
# 
12
# Unless required by applicable law or agreed to in writing, software
13
# distributed under the License is distributed on an "AS IS" BASIS,
14
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
# See the License for the specific language governing permissions and
16
# limitations under the License.
17
# </@LICENSE>
18
19
# (rough) graphic demo of this algorithm:
20
# 0.0  = -limit [......] 0 ........ limit
21
# 0.25 = -limit ..[..... 0 .]...... limit
22
# 0.5  = -limit ....[... 0 ...].... limit
23
# 0.75 = -limit ......[. 0 .....].. limit
24
# 1.0  = -limit ........ 0 [......] limit
25
my $sliding_window_limits = 4.8; # limits = [-$range, +$range]
26
my $sliding_window_size =   5.5; # scores have this range within limits
27
28
# 0.0  = -limit [......] 0 ........ limit
29
# 0.25 = -limit ....[... 0 ]....... limit
30
# 0.5  = -limit ......[. 0 .]...... limit (note: tighter)
31
# 0.75 = -limit .......[ 0 ...].... limit
32
# 1.0  = -limit ........ 0 [......] limit
33
my $shrinking_window_lower_base =   0.00; 
34
my $shrinking_window_lower_range =  1.00; # *ratio, added to above
35
my $shrinking_window_size_base =    1.00;
36
my $shrinking_window_size_range =   1.00; # *ratio, added to above
37
38
my $use_sliding_window = 0;
39
40
my $argcffile = shift @ARGV;
41
my $scoreset = shift @ARGV;
42
$scoreset = 0 if ( !defined $scoreset );
43
44
if (defined ($argcffile) && $argcffile eq '-test') {
45
  # use this to debug the ranking -> score-range mapping:
46
  for $rat (0.0, 0.25, 0.5, 0.75, 1.0) {
47
    my ($lo, $hi); if ($use_sliding_window) {
48
      ($lo, $hi) = sliding_window_ratio_to_range($rat);
49
    } else {
50
      ($lo, $hi) = shrinking_window_ratio_to_range($rat);
51
    }
52
    warn "test: $rat => [ $lo $hi ]\n";
53
  } exit;
54
}
55
56
my %freq_spam = ();
57
my %freq_nonspam = ();
58
59
my $num_spam;
60
my $num_nonspam;
61
my $num_total;
62
63
my %mutable_tests = ();
64
my %ranking = ();
65
my %soratio = ();
66
my %is_nice = ();
67
68
if (!defined $argcffile) { $argcffile = "../rules"; }
69
system ("./parse-rules-for-masses -d \"$argcffile\" -s $scoreset") and die;
70
if (-e "tmp/rules.pl") {
71
  # Note, the spaces need to stay in front of the require to work around a RPM 4.1 problem
72
  require "./tmp/rules.pl";
73
}
74
else {
75
  die "parse-rules-for-masses had no error but no tmp/rules.pl!?!";
76
}
77
78
while (<>) {
79
  /^\s*([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+\S+\s+(.+)\s*$/ or next;
80
81
  my $overall = $1+0;
82
  my $spam = $2+0;
83
  my $nonspam = $3+0;
84
  my $soratio = $4+0;
85
  my $ranking = $5+0;
86
  my $test = $6;
87
88
  if ($test eq '(all messages)') {
89
    $num_spam = $spam;
90
    $num_nonspam = $nonspam;
91
    $num_total = $spam+$nonspam;
92
    next;
93
  }
94
  next if ($test eq '(all messages as %)');
95
96
  if (!defined ($rules{$test})) {
97
    warn "rule $test no longer exists; ignoring\n";
98
    next;
99
  }
100
101
  $freq{$test} = $overall;
102
  $freq_spam{$test} = $spam;
103
  $freq_nonspam{$test} = $nonspam;
104
105
  my $tflags = $rules{$test}->{tflags}; $tflags ||= '';
106
  if ($tflags =~ /\buserconf\b/ ||
107
      ( ($scoreset % 2) == 0 && $tflags =~ /\bnet\b/ )) {
108
    $mutable_tests{$test} = 0;
109
  } else {
110
    $mutable_tests{$test} = 1;
111
  }
112
  if ($tflags =~ m/\bnice\b/i) {
113
    $is_nice{$test} = 1;
114
  } else {
115
    $is_nice{$test} = 0;
116
  }
117
118
  if ($overall < 0.01) {        # less than 0.01% of messages were hit
119
    $mutable_tests{$test} = 0;
120
    $soratio{$test} = 0.5;
121
    $ranking{$test} = 0.0;
122
    $rules{$test}->{score} = 0; # tvd - disable these rules automagically
123
124
  } else {
125
    $soratio{$test} = $soratio;
126
    $ranking{$test} = $ranking;
127
  }
128
}
129
130
if ( ! mkdir "tmp", 0755 ) {
131
  warn "Couldn't create tmp directory!: $!\n";
132
}
133
134
open (OUT, ">tmp/ranges.data");
135
foreach my $test (sort { $ranking{$b} <=> $ranking{$a} } keys %freq) {
136
  if (!defined ($rules{$test})) {
137
    warn "no rule $test";
138
    print OUT ("0 0 0 $test\n");
139
    next;
140
  }
141
142
  my $overall = $freq{$test};
143
  my $spam = $freq_spam{$test};
144
  my $nonspam = $freq_nonspam{$test};
145
  my $soratio = $soratio{$test};
146
  my $ranking = $ranking{$test};
147
  my $mutable = $mutable_tests{$test};
148
149
  if (!$mutable || $rules{$test}->{score} == 0) { # didn't look for score 0 - tvd
150
    printf OUT ("%3.3f %3.3f 0 $test\n",
151
                         $rules{$test}->{score},
152
                         $rules{$test}->{score});
153
    next;
154
  }
155
156
  # 0.0 = best nice, 1.0 = best nonnice
157
  if ($is_nice{$test}) {
158
    $ranking = .5 - ($ranking / 2);
159
  } else {
160
    $ranking = .5 + ($ranking / 2);
161
  }
162
163
  my ($lo, $hi);
164
  if ($use_sliding_window) {
165
    ($lo, $hi) = sliding_window_ratio_to_range($ranking);
166
  } else {
167
    ($lo, $hi) = shrinking_window_ratio_to_range($ranking);
168
  }
169
170
  # tvd
171
  my $tflags = $rules{$test}->{tflags}; $tflags ||= '';
172
  if ( $is_nice{$test} && ( $ranking < .5 ) ) { # proper nice rule
173
    if ( $tflags =~ /\blearn\b/ ) { # learn rules should get a higher score # -5.4
174
      $lo *=1.8;
175
    }
176
    elsif ($soratio <= 0.05 && $nonspam > 0.5) { # let good rules be larger if they want to, -4.5
177
      $lo *= 1.5;
178
    }
179
180
    $hi =	($soratio == 0) ? $lo :
181
    		($soratio <= 0.005 ) ? $lo/1.1 :
182
    		($soratio <= 0.010 && $nonspam > 0.2) ? $lo/2.0 :
183
		($soratio <= 0.025 && $nonspam > 1.5) ? $lo/10.0 :
184
		0;
185
186
    if ( $soratio >= 0.35 ) { # auto-disable bad rules
187
      ($lo,$hi) = (0,0);
188
    }
189
  }
190
  elsif ( !$is_nice{$test} && ( $ranking >= .5 ) ) { # proper spam rule
191
    if ( $tflags =~ /\blearn\b/ ) { # learn rules should get a higher score
192
      $hi *=1.8;
193
    }
194
    elsif ( $soratio >= 0.99 && $spam > 1.0 ) {
195
      $hi *= 1.5; # let good rules be larger if they want to
196
    }
197
198
    $lo =	($soratio == 1) ? $hi:
199
    		($soratio >= 0.995 ) ? $hi/4.0 :
200
    		($soratio >= 0.990 && $spam > 1.0) ? $hi/8.0 :
201
		($soratio >= 0.900 && $spam > 10.0) ? $hi/24.0 :
202
		0;
203
204
    if ( $soratio <= 0.65 ) { # auto-disable bad rules
205
      ($lo,$hi) = (0,0);
206
    }
207
  }
208
  else { # rule that has bad nice setting
209
    ($lo,$hi) = (0,0);
210
  }
211
  $mutable = 0 if ( $hi == $lo );
212
213
  printf OUT ("%3.1f %3.1f $mutable $test\n", $lo, $hi);
214
}
215
close OUT;
216
exit;
217
218
sub sliding_window_ratio_to_range {
219
  my $ratio = shift;
220
  my $lo = -$sliding_window_limits + ($sliding_window_size * $ratio);
221
  my $hi = +$sliding_window_limits - ($sliding_window_size * (1-$ratio));
222
  if ($lo > $hi) { # ???
223
    ($lo,$hi) = ($hi,$lo);
224
  }
225
  ($lo, $hi);
226
}
227
228
sub shrinking_window_ratio_to_range {
229
  my $ratio = shift;
230
  my $is_nice = 0;
231
  my $adjusted = ($ratio -.5) * 2;      # adj [0,1] to [-1,1]
232
  if ($adjusted < 0) { $is_nice = 1; $adjusted = -$adjusted; }
233
234
#$adjusted /= 1.5 if ( $ratio < 0.95 && $ratio > 0.15 ); # tvd
235
236
  my $lower = $shrinking_window_lower_base 
237
                        + ($shrinking_window_lower_range * $adjusted);
238
  my $range = $shrinking_window_size_base 
239
                        + ($shrinking_window_size_range * $adjusted);
240
  my $lo = $lower;
241
  my $hi = $lower + $range;
242
  if ($is_nice) {
243
    my $tmp = $hi; $hi = -$lo; $lo = -$tmp;
244
  }
245
  if ($lo > $hi) { # ???
246
    ($lo,$hi) = ($hi,$lo);
247
  }
248
249
  ($lo, $hi);
250
}
251
(-)masses/find-extremes (-153 / +182 lines)
Lines 17-54 Link Here
17
# limitations under the License.
17
# limitations under the License.
18
# </@LICENSE>
18
# </@LICENSE>
19
19
20
use Getopt::Std;
21
getopts("l:L:h");
22
20
21
use FindBin;
22
use lib "$FindBin::Bin/../lib";
23
use Mail::SpamAssassin::Masses;
24
use Getopt::Long qw(:config bundling auto_help);
25
use Pod::Usage;
26
use strict;
27
use warnings;
28
23
use vars qw {
29
use vars qw {
24
  $opt_h $opt_l $opt_L
30
$opt_c $opt_s $opt_l $opt_L $opt_inclang
25
};
31
};
26
32
27
sub usage {
33
GetOptions("c|cffile=s@" => \$opt_c,
28
  die "find-extremes [-l LC] [-L LC] [spam log] [nonspam log]
34
           "s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode)
35
           "l|logfile=s" => \$opt_l,
36
           "L|language=s" => \$opt_L,
37
           "include-language=s" => \$opt_inclang);
29
38
30
    -l LC  also print language specific rules for lang code LC (or 'all')
31
    -L LC  only print language specific rules for lang code LC (or 'all')
32
39
33
    options -l and -L are mutually exclusive.
34
40
35
    if either the spam or and nonspam logs are unspecified, the defaults
41
my $lower = 1;
36
    are \"spam.log\" and \"nonspam.log\" in the cwd.
42
#$threshold = 5;
43
my $higher = 9;
44
my $min_expected = 2; # Should not be set to more than 5 or less than 2
37
45
38
";
46
47
=head1 NAME
48
 
49
find-extremes - Determine which rules are most likely to cause false positives/negatives.
50
 
51
=head1 SYNOPSIS
52
 
53
find-extremes [options]
54
 
55
 Options:
56
    -c,--cffile=path      Use path as the rules directory
57
    -s,--scoreset=n       Use scoreset n
58
    -l,--logfile=file     Read in file instead of masses.log
59
    -L,--language=lc      Only print language specific tests for specified lang code (try 'all')
60
    --include-language=lc Also print language specific tests for specified lang code (try 'all')
61
 
62
=head1 DESCRIPTION
63
64
B<find-extremes> will read the mass-check log F<masses.log> or the
65
log given by the B<--logfile> option. By default, B<find-extremes>
66
will assume the proper values for B<--cffile> based on the header of
67
the masses.log. The output will include the following columns:
68
69
=over 4
70
71
=item RULE
72
73
=item CHISQUARE
74
75
=item RATIO_FALSEPOS
76
77
=item OVER_FALSEPOS
78
79
=item FREQ_OVER
80
81
=back
82
83
=head1 BUGS
84
85
This script may or may not work as designed - it probably needs some
86
tweaking, and I probably introduced a bug into it while re-writing for
87
the new Masses stuff. 
88
89
=head1 NOTES
90
91
This script is poorly documented. Patches welcome.
92
93
=cut
94
95
96
$opt_s = 0 unless defined $opt_s;
97
98
my $ok_lang = lc ( $opt_inclang || $opt_L || '');
99
$ok_lang = '.' if ($ok_lang eq 'all');
100
101
my $greprules = sub {
102
  my ($name, $rule) = @_;
103
104
  return 0 if (($opt_L && !$rule->{lang}) ||
105
           ($rule->{lang} &&
106
            (!$ok_lang || $rule->{lang} !~ /^$ok_lang/i))); # Wrong language
107
108
  return 0 if ($rule->{tflags} =~ /\bnet\b/);
109
110
  return 1;
111
112
};
113
114
$opt_l ||= "masses.log";
115
116
if (!$opt_c || !scalar(@$opt_c)) {
117
    # Try to read this in from the log, if possible
118
    open (IN, $opt_l) or die "Can't open $opt_l: $!";
119
    my $files = 0; # are we in the files section?
120
    while(<IN>) {
121
        if (!$files) {
122
            if (/^\# SVN revision:/) {
123
                $opt_c = [ "$FindBin::Bin/../rules" ];
124
                last;
125
            } elsif (/^\# Using configuration:$/) {
126
                $files = 1;
127
            }
128
        } elsif (/^\#\s+(.*)\s*$/) {
129
            push (@$opt_c, $1);
130
        } else {
131
            # All done!
132
            last;
133
        }
134
    }
135
136
    foreach my $file (@$opt_c) {
137
        die "Can't read $file" unless -r $file;
138
    }
39
}
139
}
40
140
41
usage() if($opt_h || ($opt_l && $opt_L));
141
my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c,
142
                                               scoreset => $opt_s,
143
                                               greprules => $greprules,
144
                                               logfile => $opt_l,
145
                                               nologs => 1});
42
146
43
$lower = 1;
147
$masses->readrules();
44
#$threshold = 5;
148
$masses->readlogs();
45
$higher = 9;
46
$min_expected = 2; # Should not be set to more than 5 or less than 2
47
149
48
my %freq_spam = ();	# how often non-nice found in spam
150
my $rules = $masses->get_rules_hash();
151
my $logs = $masses->get_logs();
152
153
my $num_spam = $masses->get_num_spam();
154
my $num_ham = $masses->get_num_ham();
155
49
my %freq_over_higher_falsepos = (); # how often non-nice found in ones over
156
my %freq_over_higher_falsepos = (); # how often non-nice found in ones over
50
                                    # higher threshold that are false positives
157
                                    # higher threshold that are false positives
51
my %freq_nonspam = ();	# how often nice found in nonspam
52
my %freq_under_lower_falseneg = (); # how often nice found in ones under
158
my %freq_under_lower_falseneg = (); # how often nice found in ones under
53
                                    # lower threshold that are false negatives
159
                                    # lower threshold that are false negatives
54
160
Lines 59-101 Link Here
59
my %ratio_expected_falsepos = (); # ratio version of above
165
my %ratio_expected_falsepos = (); # ratio version of above
60
my %ratio_expected_falseneg = (); # ditto
166
my %ratio_expected_falseneg = (); # ditto
61
167
62
my $num_spam = 0;
63
my $num_nonspam = 0;
64
my $num_over_higher_falsepos = 0;
168
my $num_over_higher_falsepos = 0;
65
my $num_under_lower_falseneg = 0;
169
my $num_under_lower_falseneg = 0;
66
my $ok_lang = '';
67
170
68
readscores();
171
my %chisquare = ( );
172
my %prob = ( );
69
173
70
$ok_lang = lc ($opt_l || $opt_L || '');
71
if ($ok_lang eq 'all') { $ok_lang = '.'; }
72
174
73
foreach my $key (keys %rules) {
175
foreach my $key (keys %$rules) {
74
176
75
  if ( ($opt_L && !$rules{$key}->{lang}) ||
177
  if ($rules->{$key}->{tflags} !~ /\buserconf\b/) {
76
       ($rules{$key}->{lang} &&
178
    if ($rules->{$key}->{tflags} =~ m/nice/) {
77
         (!$ok_lang || $rules{$key}->{lang} !~ /^$ok_lang/i)
78
     ) ) {
79
    delete $rules{$key} ; next;
80
  }
81
82
  if ($rules{$key}->{tflags} =~ m/net/) {
83
    delete $rules{$key};
84
    next;
85
  }
86
  if ($rules{$key}->{tflags} !~ m/userconf/) {
87
    if ($rules{$key}->{tflags} =~ m/nice/) {
88
      $freq_nonspam{$key} = 0;
89
      $freq_under_lower_falseneg{$key} = 0;
179
      $freq_under_lower_falseneg{$key} = 0;
90
    } else {
180
    } else {
91
      $freq_spam{$key} = 0;
92
      $freq_over_higher_falsepos{$key} = 0;
181
      $freq_over_higher_falsepos{$key} = 0;
93
    }
182
    }
94
  }
183
  }
184
95
}
185
}
96
186
97
readlogs();
187
foreach my $log (@$logs) {
98
188
189
  if($log->{isspam}) {
190
    # Also need to count plus_hits
191
    my $plus_hits = 0;
192
    foreach my $test (@{$log->{tests_hit}}) {
193
      $plus_hits += $test->{score} if ($test->{score} > 0);
194
    }
195
196
    if(($log->{score} <= $lower) && $plus_hits && $plus_hits >= $lower) {
197
      $num_under_lower_falseneg++;
198
      foreach my $test (@{$log->{tests_hit}}) {
199
	$num_under_lower_falseneg++;
200
	$freq_under_lower_falseneg{$test->{name}}++ if exists $freq_under_lower_falseneg{$test->{name}};
201
      }
202
    }
203
  }
204
  else {
205
    if($log->{score} > $higher) {
206
      $num_over_higher_falsepos++;
207
      foreach my $test (@{$log->{tests_hit}}) {
208
	$num_over_higher_falsepos++;
209
	$freq_over_higher_falsepos{$test->{name}}++ if exists $freq_over_higher_falsepos{$test->{name}};
210
      }
211
    }
212
  }
213
214
}
215
99
unless (($num_over_higher_falsepos >= $min_expected)
216
unless (($num_over_higher_falsepos >= $min_expected)
100
	&& ($num_under_lower_falseneg >= $min_expected)) {
217
	&& ($num_under_lower_falseneg >= $min_expected)) {
101
  die "Insufficient extremes in dataset (" . $num_over_higher_falsepos .
218
  die "Insufficient extremes in dataset (" . $num_over_higher_falsepos .
Lines 119-130 Link Here
119
}
236
}
120
237
121
my $ratio_falsepos = $num_over_higher_falsepos/$num_spam;
238
my $ratio_falsepos = $num_over_higher_falsepos/$num_spam;
122
my $ratio_falseneg = $num_under_lower_falseneg/$num_nonspam;
239
my $ratio_falseneg = $num_under_lower_falseneg/$num_ham;
123
240
124
my $skipped_non_nice = 0;
241
my $skipped_non_nice = 0;
125
242
126
foreach $rule (keys %freq_spam) {
243
# non-nice rules
127
  my $expected = $freq_spam{$rule}*$ratio_falsepos;
244
foreach my $rule (keys %freq_over_higher_falsepos) {
245
  my $expected = $rules->{$rule}->{freq_spam}*$ratio_falsepos;
128
  if ($expected <= $min_expected) {
246
  if ($expected <= $min_expected) {
129
    $skipped_non_nice++;
247
    $skipped_non_nice++;
130
    next;
248
    next;
Lines 136-142 Link Here
136
   $freq_over_higher_falsepos{$rule}/$expected;
254
   $freq_over_higher_falsepos{$rule}/$expected;
137
  ($chisquare{$rule},$prob{$rule}) =
255
  ($chisquare{$rule},$prob{$rule}) =
138
   chisquare($num_spam,$num_over_higher_falsepos,
256
   chisquare($num_spam,$num_over_higher_falsepos,
139
	     $freq_spam{$rule},$freq_over_higher_falsepos{$rule});
257
	     $rules->{$rule}->{freq_spam},$freq_over_higher_falsepos{$rule});
140
  if ($freq_over_higher_falsepos{$rule} < $expected) {
258
  if ($freq_over_higher_falsepos{$rule} < $expected) {
141
    $chisquare{$rule} *= -1;
259
    $chisquare{$rule} *= -1;
142
  }
260
  }
Lines 146-153 Link Here
146
264
147
my $skipped_nice = 0;
265
my $skipped_nice = 0;
148
266
149
foreach $rule (keys %freq_nonspam) {
267
# nice rules
150
  my $expected = $freq_nonspam{$rule}*$ratio_falseneg;
268
foreach my $rule (keys %freq_under_lower_falseneg) {
269
  my $expected = $rules->{$rule}->{freq_ham}*$ratio_falseneg;
151
  if ($expected <= $min_expected) {
270
  if ($expected <= $min_expected) {
152
    $skipped_nice++;
271
    $skipped_nice++;
153
    next;
272
    next;
Lines 158-165 Link Here
158
  $ratio_expected_falseneg{$rule} =
277
  $ratio_expected_falseneg{$rule} =
159
   $freq_under_lower_falseneg{$rule}/$expected;
278
   $freq_under_lower_falseneg{$rule}/$expected;
160
  ($chisquare{$rule},$prob{$rule}) =
279
  ($chisquare{$rule},$prob{$rule}) =
161
   chisquare($num_nonspam,$num_under_lower_falseneg,
280
   chisquare($num_ham,$num_under_lower_falseneg,
162
	     $freq_nonspam{$rule},$freq_under_lower_falseneg{$rule});
281
	     $rules->{$rule}->{freq_ham},$freq_under_lower_falseneg{$rule});
163
  if ($freq_under_lower_falseneg{$rule} < $expected) {
282
  if ($freq_under_lower_falseneg{$rule} < $expected) {
164
    $chisquare{$rule} *= -1;
283
    $chisquare{$rule} *= -1;
165
  }
284
  }
Lines 167-174 Link Here
167
286
168
warn "Skipped nice: $skipped_nice\n";
287
warn "Skipped nice: $skipped_nice\n";
169
288
170
@rules_falsepos = grep {$prob{$_} < .5} (keys %over_expected_falsepos);
289
# The rest is copied verbatim from before - it's complicated and not
290
# commented and should work unchanged except for the freq_spam and
291
# freq_ham stuff and fixing some use strict stuff
171
292
293
my @rules_falsepos = grep {$prob{$_} < .5} (keys %over_expected_falsepos);
294
172
if (scalar(@rules_falsepos)) {
295
if (scalar(@rules_falsepos)) {
173
  print "RULE\t\tCHISQUARE\tRATIO_FALSEPOS\tOVER_FALSEPOS\tFREQ_OVER ($num_over_higher_falsepos)\n";
296
  print "RULE\t\tCHISQUARE\tRATIO_FALSEPOS\tOVER_FALSEPOS\tFREQ_OVER ($num_over_higher_falsepos)\n";
174
  my(@rules_falsepos_bad) =
297
  my(@rules_falsepos_bad) =
Lines 183-189 Link Here
183
	   $over_expected_falsepos{$a}) ||
306
	   $over_expected_falsepos{$a}) ||
184
	    ($freq_over_higher_falsepos{$b} <=>
307
	    ($freq_over_higher_falsepos{$b} <=>
185
	     $freq_over_higher_falsepos{$a})} (@rules_falsepos_bad);
308
	     $freq_over_higher_falsepos{$a})} (@rules_falsepos_bad);
186
    foreach $rule (@rules_falsepos_bad) {
309
    foreach my $rule (@rules_falsepos_bad) {
187
      print $rule . "\t" . $prob{$rule} . "\t" .
310
      print $rule . "\t" . $prob{$rule} . "\t" .
188
       $ratio_expected_falsepos{$rule} . "\t" .
311
       $ratio_expected_falsepos{$rule} . "\t" .
189
	$over_expected_falsepos{$rule} . "\t" .
312
	$over_expected_falsepos{$rule} . "\t" .
Lines 199-207 Link Here
199
       ($chisquare{$a} <=> $chisquare{$b}) ||
322
       ($chisquare{$a} <=> $chisquare{$b}) ||
200
	($ratio_expected_falsepos{$a} <=>
323
	($ratio_expected_falsepos{$a} <=>
201
	 $ratio_expected_falsepos{$b}) ||
324
	 $ratio_expected_falsepos{$b}) ||
202
	  ($freq_spam{$b} <=>
325
	  ($rules->{$b}->{freq_spam} <=>
203
	   $freq_spam{$a})} (@rules_falsepos_good);
326
	   $rules->{$a}->{freq_spam})} (@rules_falsepos_good);
204
    foreach $rule (@rules_falsepos_good) {
327
    foreach my $rule (@rules_falsepos_good) {
205
      print $rule . "\t" . $prob{$rule} . "\t" .
328
      print $rule . "\t" . $prob{$rule} . "\t" .
206
       $ratio_expected_falsepos{$rule} . "\t" .
329
       $ratio_expected_falsepos{$rule} . "\t" .
207
	$over_expected_falsepos{$rule} . "\t" .
330
	$over_expected_falsepos{$rule} . "\t" .
Lines 212-218 Link Here
212
  warn "No over-falsepos to print\n";
335
  warn "No over-falsepos to print\n";
213
}
336
}
214
337
215
@rules_falseneg = grep {$prob{$_} < .5} (keys %over_expected_falseneg);
338
my @rules_falseneg = grep {$prob{$_} < .5} (keys %over_expected_falseneg);
216
339
217
if (scalar(@rules_falseneg)) {
340
if (scalar(@rules_falseneg)) {
218
  print "RULE\t\tCHISQUARE\tRATIO_FALSENEG\tOVER_FALSENEG\tFREQ_UNDER ($num_under_lower_falseneg)\n";
341
  print "RULE\t\tCHISQUARE\tRATIO_FALSENEG\tOVER_FALSENEG\tFREQ_UNDER ($num_under_lower_falseneg)\n";
Lines 228-234 Link Here
228
	   $over_expected_falseneg{$a}) ||
351
	   $over_expected_falseneg{$a}) ||
229
	    ($freq_under_lower_falseneg{$b} <=>
352
	    ($freq_under_lower_falseneg{$b} <=>
230
	     $freq_under_lower_falseneg{$a})} (@rules_falseneg_bad);
353
	     $freq_under_lower_falseneg{$a})} (@rules_falseneg_bad);
231
    foreach $rule (@rules_falseneg_bad) {
354
    foreach my $rule (@rules_falseneg_bad) {
232
      print $rule . "\t" . $prob{$rule} . "\t" .
355
      print $rule . "\t" . $prob{$rule} . "\t" .
233
       $ratio_expected_falseneg{$rule} . "\t" .
356
       $ratio_expected_falseneg{$rule} . "\t" .
234
	$over_expected_falseneg{$rule} . "\t" .
357
	$over_expected_falseneg{$rule} . "\t" .
Lines 244-252 Link Here
244
       ($chisquare{$a} <=> $chisquare{$b}) ||
367
       ($chisquare{$a} <=> $chisquare{$b}) ||
245
	($ratio_expected_falseneg{$a} <=>
368
	($ratio_expected_falseneg{$a} <=>
246
	 $ratio_expected_falseneg{$b}) ||
369
	 $ratio_expected_falseneg{$b}) ||
247
	  ($freq_spam{$b} <=>
370
	  ($rules->{$b}->{freq_ham} <=>
248
	   $freq_spam{$a})} (@rules_falseneg_good);
371
	   $rules->{$a}->{freq_ham})} (@rules_falseneg_good);
249
    foreach $rule (@rules_falseneg_good) {
372
    foreach my $rule (@rules_falseneg_good) {
250
      print $rule . "\t" . $prob{$rule} . "\t" .
373
      print $rule . "\t" . $prob{$rule} . "\t" .
251
       $ratio_expected_falseneg{$rule} . "\t" .
374
       $ratio_expected_falseneg{$rule} . "\t" .
252
	$over_expected_falseneg{$rule} . "\t" .
375
	$over_expected_falseneg{$rule} . "\t" .
Lines 258-354 Link Here
258
}
381
}
259
382
260
exit;
383
exit;
261
262
sub readlogs {
263
  my $spam = $ARGV[0] || "spam.log";
264
  my $nonspam = $ARGV[1] || (-f "good.log" ? "good.log" : "nonspam.log");
265
266
267
  (open(NONSPAM,$nonspam)) ||
268
   (die "Couldn't open file '$nonspam': $!; stopped");
269
270
  while (defined($line = <NONSPAM>)) {
271
    if ($line =~ m/^\s*\#/) {
272
      next;
273
    } elsif ($line =~ m/^.\s+-?\d+\s+\S+\s*(\S*)/) {
274
      my $tests = $1;
275
      my $hits = 0;
276
      my(@tests) = ();
277
      foreach $test (grep {length($_)} (split(/,+/,$tests))) {
278
	if (exists($rules{$test})) {
279
	  push @tests, $test;
280
	  $hits += $rules{$test}->{score};
281
	}
282
      }
283
      
284
      if (scalar(@tests)) {
285
	$num_nonspam++;
286
	foreach $test (grep {exists($freq_nonspam{$_})} (@tests)) {
287
	  $freq_nonspam{$test}++;
288
	}
289
	if ($hits >= $higher) {
290
	  $num_over_higher_falsepos++;
291
	  foreach $test (grep
292
			 {exists($freq_over_higher_falsepos{$_})} (@tests)) {
293
	    $freq_over_higher_falsepos{$test}++;
294
	  }
295
	}
296
      }
297
    } elsif ($line =~ m/\S/) {
298
      chomp($line);
299
      warn "Can't interpret line '$line'; skipping";
300
    }
301
  }
302
303
  close(NONSPAM);
304
305
  (open(SPAM,$spam)) || (die "Couldn't open file '$spam': $!; stopped");
306
307
  while (defined($line = <SPAM>)) {
308
    if ($line =~ m/^\s*\#/) {
309
      next;
310
    } elsif ($line =~ m/^.\s+-?\d+\s+\S+\s*(\S*)/) {
311
      my $tests = $1;
312
      my $hits = 0;
313
      my $plus_hits = 0;
314
      my(@tests) = ();
315
      foreach $test (grep {length($_)} (split(/,+/,$tests))) {
316
	if (exists($rules{$test})) {
317
	  push @tests, $test;
318
	  $hits += $rules{$test}->{score};
319
	  if ($rules{$test}->{score} > 0) {
320
	    $plus_hits += $rules{$test}->{score};
321
	  }
322
	}
323
      }
324
      
325
      if (scalar(@tests)) {
326
	$num_spam++;
327
	foreach $test (grep {exists($freq_spam{$_})} (@tests)) {
328
	  $freq_spam{$test}++;
329
	}
330
	if (($hits <= $lower) && $plus_hits &&
331
	    ($plus_hits >= $lower)) {
332
	  $num_under_lower_falseneg++;
333
	  foreach $test (grep
334
			 {exists($freq_under_lower_falseneg{$_})} (@tests)) {
335
	    $freq_under_lower_falseneg{$test}++;
336
	  }
337
	}
338
      }
339
    } elsif ($line =~ m/\S/) {
340
      chomp($line);
341
      warn "Can't interpret line '$line'; skipping";
342
    }
343
  }
344
345
  close(SPAM);
346
}
347
348
349
sub readscores {
350
  system ("./parse-rules-for-masses") and
351
   die "Couldn't do parse-rules-for-masses: $?; stopped";
352
  require "./tmp/rules.pl";
353
}
354
(-)masses/tenpass/10pass-compute-tcr (-4 / +4 lines)
Lines 6-17 Link Here
6
do
6
do
7
  mkdir tmp/10passrules > /dev/null 2>&1
7
  mkdir tmp/10passrules > /dev/null 2>&1
8
  cp ../rules/[0-9]*.cf tmp/10passrules
8
  cp ../rules/[0-9]*.cf tmp/10passrules
9
  ./rewrite-cf-with-new-scores $SCORESET ../rules/50_scores.cf \
9
  ./rewrite-cf-with-new-scores -s $SCORESET --old=../rules/50_scores.cf \
10
	tenpass_results/scores.$run > tmp/10passrules/50_scores.cf
10
	--new=tenpass_results/scores.$run --out=tmp/10passrules/50_scores.cf \
11
        --cffile=../rules
11
12
12
  ./fp-fn-statistics --cffile=tmp/10passrules \
13
  ./fp-fn-statistics --cffile=tmp/10passrules \
13
	--spam=tenpass_results/spam.log.$run \
14
	--logfile=tenpass_results/masses.log.$run > tmp/stats
14
	--nonspam=tenpass_results/ham.log.$run > tmp/stats
15
15
16
  grep TCR: tmp/stats
16
  grep TCR: tmp/stats
17
done
17
done
(-)masses/tenpass/10pass-run (-17 / +13 lines)
Lines 1-13 Link Here
1
#!/bin/sh
1
#!/bin/sh
2
2
3
# change these!
3
# change these!
4
NSBASE=ham-logs
4
BASE=logs/
5
SPBASE=spam-logs
6
SCORESET="0"
7
5
8
passes="1 2 3 4 5 6 7 8 9 10"
6
passes="1 2 3 4 5 6 7 8 9 10"
9
mkdir -p tenpass_results
7
mkdir tenpass_results
10
mkdir -p ORIG
11
8
12
> make.output
9
> make.output
13
10
Lines 17-44 Link Here
17
  echo "Training for corpus $id..."
14
  echo "Training for corpus $id..."
18
  pwd; date
15
  pwd; date
19
16
20
  > ORIG/ham-set$SCORESET.log
17
  > masses.log
21
  > ORIG/spam-set$SCORESET.log
22
23
  echo -n "(using corpora blocks: "
18
  echo -n "(using corpora blocks: "
24
  for notid in $passes ; do
19
  for notid in $passes ; do
25
    if [ "$notid" != "$id" ] ; then
20
    if [ "$notid" != "$id" ] ; then
26
      echo -n "$notid "
21
      echo -n "$notid "
27
      cat $NSBASE/split-$notid.log >> ORIG/ham-set$SCORESET.log
22
      cat $BASE/split-$notid.log >> masses.log
28
      cat $SPBASE/split-$notid.log >> ORIG/spam-set$SCORESET.log
29
    fi
23
    fi
30
  done
24
  done
31
  echo "for training)"
25
  echo "for training)"
32
26
33
  make clean >> make.output
27
  make clean >> make.output
34
  make >> make.output 2>&1
28
  make perceptron 2>&1 >> make.output
35
  ./runGA
29
  ./perceptron
36
  pwd
30
  pwd; date
37
  date
38
31
39
  echo "Saving test data for corpus $id..."
32
  echo "Saving test data for corpus $id..."
40
33
41
  cp $NSBASE/split-$id.log tenpass_results/ham.log.$id
34
  cp $BASE/split-$id.log tenpass_results/masses.log.$id
42
  cp $SPBASE/split-$id.log tenpass_results/spam.log.$id
35
43
  cp gen-set$SCORESET.scores tenpass_results/scores.$id
36
  cp perceptron.scores tenpass_results/scores.$id
37
44
done
38
done
39
40

Return to bug 2853