#!/usr/bin/perl -w
#
# Usage: sa33badrules.pl [attachment_number [ham [diff]]]
#
#   attachment_number  Bugzilla attachment holding a proposed scoreset
#                      (default 4558).
#   ham                Minimum percentage of the ham corpus a rule must hit
#                      to always be listed (default 1).
#   diff               Minimum amount by which ham% must exceed spam% for a
#                      rule to be listed (default 0.05).
#
# A rule is listed when it exceeds the minimum ham OR the minimum diff.
# Rules whose names start with "T_" are never listed.

use strict;
use warnings;

# Base URL of a Bugzilla attachment; "=<id>" is appended to it.
my $ATTACH_URL = 'https://issues.apache.org/SpamAssassin/attachment.cgi?id';

# Page whose text dump provides the per-rule spam%/ham% figures.
my $RULEQA_URL = 'http://ruleqa.spamassassin.org/';

# Run an external command and return its output lines.
# The list form of the pipe open bypasses the shell, so command-line
# arguments that end up in @command cannot inject shell syntax.
sub read_command_output {
    my (@command) = @_;

    open(my $fh, '-|', @command)
        or die "Cannot run '$command[0]': $!\n";
    my @lines = <$fh>;
    # A failing close here means the command exited non-zero; report it but
    # keep whatever output was produced.
    close($fh)
        or warn "'$command[0]' did not exit cleanly (status $?)\n";

    return @lines;
}

# Build a { rule_name => score_text } hash ref from scoreset lines of the
# form "score RULE_NAME values... # optional comment".
# The score text is everything after the rule name up to a '#' or the end
# of the line.  When a rule appears more than once the first line wins.
sub parse_scores {
    my (@lines) = @_;

    my %score_for;
    for my $line (@lines) {
        my ($name, $score) = $line =~ /^score +(\S+) +([^#]+)/
            or next;
        chomp $score;
        $score_for{$name} = $score unless exists $score_for{$name};
    }

    return \%score_for;
}

# Extract ($spam, $ham, $rule) from one line of the formatted ruleqa dump.
# A data row is six whitespace-separated numeric columns followed by the
# rule name; the second and third columns are taken as spam% and ham%.
# Returns an empty list for lines that are not data rows and for rules
# whose names start with "T_".
sub parse_ruleqa_line {
    my ($line) = @_;

    my ($spam, $ham, $rule) = $line =~
        /\s+[\d.]+\s+([\d.]+)\s+([\d.]+)(?:\s+[\d.]+){3}\s+(?!T_)(\w.*)/
        or return;
    $rule =~ s/\s+$//;    # trailing blanks would defeat the score lookup

    return ($spam, $ham, $rule);
}

# Format one report row: "ratio ham spam score rule\n".
# The ratio is ham/spam, multiplied by ham when ham > 1 (the "H^2/S" column).
# When spam is zero the ratio is ham (times the same factor) times 100.
# The ratio is cut to five characters and followed by "*" when ham/spam < 1,
# otherwise by a blank.
# NOTE(review): the padding literals below are single spaces as found in the
# source; confirm the intended column widths.
sub format_row {
    my ($spam, $ham, $score, $rule) = @_;

    my $fudge = $ham > 1 ? $ham : 1;
    my $extra = " ";
    my $ratio;

    # Compare numerically: a string such as "0.0000" is true in boolean
    # context and would otherwise lead to a division by zero.
    if ($spam != 0) {
        $ratio = $fudge * $ham / $spam;
        $extra = "*" if $ham / $spam < 1;
    }
    else {
        $ratio = $fudge * $ham * 100;
    }

    # Normalise to at least five characters containing a decimal point,
    # then keep exactly the first five plus the marker.
    $ratio .= ".0"   if $ratio !~ /\./;
    $ratio .= "0000" if length($ratio) < 5;
    $ratio =~ s/^(.....).*/$1$extra/;

    $score = sprintf('%-23s', $score);

    if ($ham < 10) {
        $ham .= " " if !$ham;
        $ham = " $ham";
    }
    if ($spam < 10) {
        $spam .= " " if !$spam;
        $spam = " $spam";
    }

    return "$ratio $ham $spam $score $rule\n";
}

# Fetch the scoreset and the ruleqa page, then print the report sorted by
# descending ratio.
sub main {
    my ($rules, $min_ham, $min_diff) = @_;

    $rules    = 4558 unless defined $rules;
    $min_ham  = 1    unless defined $min_ham;
    $min_diff = 0.05 unless defined $min_diff;

    my $score_for = parse_scores(
        read_command_output('elinks', '-source', "$ATTACH_URL=$rules")
    );

    my @output;
    for my $line (read_command_output('elinks', '-dump', $RULEQA_URL)) {
        my ($spam, $ham, $rule) = parse_ruleqa_line($line);
        next unless defined $spam
            && ($ham > $min_ham || $ham > $min_diff + $spam);

        my $score = exists $score_for->{$rule}
            ? $score_for->{$rule}
            : "unknown";
        push @output, format_row($spam, $ham, $score, $rule);
    }

    print "H^2/S HAM% SPAM% Score in attachment $rules Rule\n";
    # Only the leading digits of the ratio column take part in the ordering.
    print sort { substr($b, 0, 4) <=> substr($a, 0, 4) } @output;

    return;
}

# Run only when executed as a script, so the helpers can be loaded alone.
main(@ARGV) unless caller;