Lines 16-400
Link Here
|
16 |
# limitations under the License. |
16 |
# limitations under the License. |
17 |
# </@LICENSE> |
17 |
# </@LICENSE> |
18 |
|
18 |
|
|
|
19 |
|
19 |
use FindBin; |
20 |
use FindBin; |
20 |
use Getopt::Std; |
21 |
use lib "$FindBin::Bin/../lib"; |
21 |
getopts("fm:M:X:l:L:pxhc:at:s:i"); |
22 |
use Mail::SpamAssassin::Masses; |
|
|
23 |
use Getopt::Long qw(:config bundling auto_help); |
24 |
use Pod::Usage; |
25 |
use strict; |
26 |
use warnings; |
22 |
|
27 |
|
|
|
28 |
|
23 |
use vars qw { |
29 |
use vars qw { |
24 |
$opt_f $opt_m $opt_M $opt_X $opt_p $opt_x $opt_h $opt_l $opt_L $opt_c |
30 |
$opt_f $opt_m $opt_M $opt_X $opt_p $opt_x $opt_h $opt_l $opt_L $opt_c |
25 |
$opt_a $opt_t $opt_s $opt_i $sorting |
31 |
$opt_a $opt_t $opt_s $opt_z $opt_inclang $opt_auto |
26 |
}; |
32 |
}; |
27 |
|
33 |
|
28 |
sub usage { |
34 |
GetOptions("c|cffile=s@" => \$opt_c, |
29 |
die "hit-frequencies [-c rules dir] [-f] [-m RE] [-M RE] [-X RE] [-l LC] |
35 |
"s|scoreset=i" => \$opt_s, # ,, pacify emacs (stupid emacs cperl mode) |
30 |
[-s SC] [-a] [-p] [-x] [-i] [spam log] [ham log] |
36 |
"l|logfile=s" => \$opt_l, |
|
|
37 |
"f|falses" => \$opt_f, |
38 |
"a|all" => \$opt_a, |
39 |
"p|percentages" => \$opt_p, |
40 |
"x|extended" => \$opt_x, |
41 |
"m|matchrule=s" => \$opt_m, #, |
42 |
"t|tflags=s" => \$opt_t, |
43 |
"M|matchlog=s" => \$opt_M, |
44 |
"X|excludelog=s" => \$opt_X, |
45 |
"L|language=s" => \$opt_L, |
46 |
"include-language=s" => \$opt_inclang); |
31 |
|
47 |
|
32 |
-c p use p as the rules directory |
|
|
33 |
-f falses. count only false-negative or false-positive matches |
34 |
-m RE print rules matching regular expression |
35 |
-t RE print rules with tflags matching regular expression |
36 |
-M RE only consider log entries matching regular expression |
37 |
-X RE don't consider log entries matching regular expression |
38 |
-l LC also print language specific rules for lang code LC (or 'all') |
39 |
-L LC only print language specific rules for lang code LC (or 'all') |
40 |
-a display all tests |
41 |
-p percentages. implies -x |
42 |
-x extended output, with S/O ratio and scores |
43 |
-s SC which scoreset to use |
44 |
-i use IG (information gain) for ranking |
45 |
|
48 |
|
46 |
options -l and -L are mutually exclusive. |
49 |
=head1 NAME |
47 |
|
50 |
|
48 |
options -M and -X are *not* mutually exclusive. |
51 |
hit-frequencies - Display statistics about tests hit by a mass-check run |
49 |
|
52 |
|
50 |
if either the spam or and ham logs are unspecified, the defaults |
53 |
=head1 SYNOPSIS |
51 |
are \"spam.log\" and \"ham.log\" in the cwd. |
|
|
52 |
|
54 |
|
53 |
"; |
55 |
hit-frequencies [options] |
54 |
} |
|
|
55 |
|
56 |
|
56 |
usage() if($opt_h || ($opt_l && $opt_L)); |
57 |
Options: |
|
|
58 |
-c,--cffile=path Use path as the rules directory |
59 |
-s,--scoreset=n Use scoreset n |
60 |
-l,--logfile=file Read in file instead of masses.log |
61 |
-f Count only false-positives/false-negatives |
62 |
-a Report all tests (including subrules) |
63 |
-p Report percentages instead of raw hits |
64 |
-x "Extended" output, include RANK, S/O and SCORE |
65 |
-m,--matchrule=re Print rules matching the regular expression |
66 |
-t,--tflags=re Print only rules with tflags matching the regular expression |
67 |
-M,--matchlog=re Consider only logs matching the regular expression |
68 |
-X,--excludelog=re Exclude logs matching this regular expression |
69 |
-L,--language=lc Only print language specific tests for specified lang code (try 'all') |
70 |
--include-language=lc Also print language specific tests for specified lang code (try 'all') |
57 |
|
71 |
|
58 |
if ($opt_p) { |
72 |
=head1 DESCRIPTION |
59 |
$opt_x = 1; |
|
|
60 |
} |
61 |
|
73 |
|
62 |
$opt_s = 0 if ( !defined $opt_s ); |
74 |
B<hit-frequencies> will read the mass-check log F<masses.log> or the |
|
|
75 |
log given by the B<--logfile> option. The output will contain a |
76 |
summary of the number of ham and spam messages and detailed statistics |
77 |
for each rule. By default, B<hit-frequencies> will try to guess the |
78 |
proper values for B<--cffile> based on the header of the |
79 |
masses.log. The output will include the following columns: |
63 |
|
80 |
|
64 |
my $cffile = $opt_c || "$FindBin::Bin/../rules"; |
81 |
=over 4 |
65 |
|
82 |
|
66 |
my %freq_spam = (); |
83 |
=item OVERALL |
67 |
my %freq_ham = (); |
|
|
68 |
my $num_spam = 0; |
69 |
my $num_ham = 0; |
70 |
my %ranking = (); |
71 |
my $ok_lang = ''; |
72 |
|
84 |
|
73 |
readscores($cffile); |
85 |
Number of times (or percentage with B<-p>) the rule hit on |
|
|
86 |
all messages (spam or ham). |
74 |
|
87 |
|
75 |
$ok_lang = lc ($opt_l || $opt_L || ''); |
88 |
=item SPAM |
76 |
if ($ok_lang eq 'all') { $ok_lang = '.'; } |
|
|
77 |
|
89 |
|
78 |
foreach my $key (keys %rules) { |
90 |
Number of times (or percentage with B<-p>) the rule hit on |
|
|
91 |
spam messages. |
79 |
|
92 |
|
80 |
if ( ($opt_L && !$rules{$key}->{lang}) || |
93 |
=item HAM |
81 |
($rules{$key}->{lang} && |
|
|
82 |
(!$ok_lang || $rules{$key}->{lang} !~ /^$ok_lang/i) |
83 |
) ) { |
84 |
delete $rules{$key} ; next; |
85 |
} |
86 |
|
94 |
|
87 |
$freq_spam{$key} = 0; |
95 |
Number of times (or percentage with B<-p>) the rule hit on |
88 |
$freq_ham{$key} = 0; |
96 |
ham messages. |
89 |
} |
|
|
90 |
|
97 |
|
91 |
readlogs(); |
98 |
=item S/O |
92 |
|
99 |
|
93 |
my $hdr_all = $num_spam + $num_ham; |
100 |
Shown only with B<-x> or B<-p>, this is the number of spam hits |
94 |
my $hdr_spam = $num_spam; |
101 |
divided by total number of hits (C<S/O> refers to spam divided by |
95 |
my $hdr_ham = $num_ham; |
102 |
overall). |
96 |
|
103 |
|
97 |
if ($opt_p) { |
104 |
=item RANK |
98 |
my $sorting = $opt_i ? "IG" : "RANK"; |
|
|
99 |
if ($opt_f) { |
100 |
printf "%7s %7s %7s %6s %6s %6s %s\n", |
101 |
"OVERALL%", "FNEG%", "FPOS%", "S/O", $sorting, "SCORE", "NAME"; |
102 |
} else { |
103 |
printf "%7s %7s %7s %6s %6s %6s %s\n", |
104 |
"OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME"; |
105 |
} |
106 |
printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n", |
107 |
$hdr_all, $hdr_spam, $hdr_ham, |
108 |
soratio ($num_spam,$num_ham), 0, 0; |
109 |
|
105 |
|
110 |
$hdr_spam = ($num_spam / $hdr_all) * 100.0; |
106 |
Shown only with B<-x> or B<-p>, this is a measure that attempts to |
111 |
$hdr_ham = ($num_ham / $hdr_all) * 100.0; |
107 |
indicate how I<good> or I<useful> a test is. The higher it is, the |
112 |
$hdr_all = 100.0; # this is obvious |
108 |
better the test. |
113 |
printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f (all messages as %%)\n", |
|
|
114 |
$hdr_all, $hdr_spam, $hdr_ham, |
115 |
soratio ($num_spam,$num_ham), 0, 0; |
116 |
|
109 |
|
117 |
} elsif ($opt_x) { |
110 |
=item SCORE |
118 |
printf "%7s %7s %7s %6s %6s %6s %s\n", |
|
|
119 |
"OVERALL%", "SPAM%", "HAM%", "S/O", $sorting, "SCORE", "NAME"; |
120 |
printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n", |
121 |
$hdr_all, $hdr_spam, $hdr_ham, |
122 |
soratio ($num_spam,$num_ham), 0, 0; |
123 |
|
111 |
|
124 |
} else { |
112 |
Shown only with B<-x> or B<-p>, this is the current score assigned to |
125 |
printf "%10s %10s %10s %s\n", |
113 |
the rule. |
126 |
"OVERALL", "SPAM", "HAM", "NAME"; |
|
|
127 |
printf "%10d %10d %10d (all messages)\n", |
128 |
$hdr_all, $hdr_spam, $hdr_ham; |
129 |
} |
130 |
|
114 |
|
131 |
my %done = (); |
115 |
=item NAME |
132 |
my @tests = (); |
|
|
133 |
my $rank_hi = 0; |
134 |
my $rank_lo = 9999999; |
135 |
|
116 |
|
136 |
# variables for wanted/unwanted RANK |
117 |
This is the rule's name. |
137 |
my %wanted; |
|
|
138 |
my %unwanted; |
139 |
my %wranks; |
140 |
my %uranks; |
141 |
|
118 |
|
142 |
foreach my $test (keys %freq_spam, keys %freq_ham) { |
119 |
=back |
143 |
next unless (exists $rules{$test}); # only valid tests |
|
|
144 |
next if (!$opt_a && $rules{$test}->{issubrule}); |
145 |
|
120 |
|
146 |
next if $done{$test}; $done{$test} = 1; |
121 |
=head1 BUGS |
147 |
push (@tests, $test); |
|
|
148 |
|
122 |
|
149 |
my $isnice = 0; |
123 |
Please report bugs to http://bugzilla.spamassassin.org/ |
150 |
if ($rules{$test}->{tflags} =~ /nice/) { $isnice = 1; } |
|
|
151 |
|
124 |
|
152 |
my $fs = $freq_spam{$test}; $fs ||= 0; |
125 |
=head1 SEE ALSO |
153 |
my $fn = $freq_ham{$test}; $fn ||= 0; |
|
|
154 |
my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0; |
155 |
my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0; |
156 |
|
126 |
|
157 |
my $soratio = $soratio{$test} = soratio ($fsadj, $fnadj); |
127 |
L<mass-check(1)>, L<Mail::SpamAssassin::Masses(3)>, L<perceptron(1)> |
158 |
|
128 |
|
159 |
if ($isnice) { |
129 |
=cut |
160 |
$soratio = 1.0 - $soratio; |
|
|
161 |
my $tmp = $fsadj; $fsadj = $fnadj; $fnadj = $tmp; |
162 |
} |
163 |
|
130 |
|
164 |
if ($opt_i) { |
131 |
if ($opt_L && $opt_inclang) { |
165 |
# come up with a ranking |
132 |
pod2usage("-L/--language and --include-language are mutually exclusive"); |
166 |
my $rank; |
|
|
167 |
|
168 |
# New new system: from "Learning to Filter Unsolicited Commercial E-Mail", |
169 |
# Ion Androutsopoulos et al: determine the information gain IG(X, C) of the |
170 |
# Boolean attributes (ie. the rules). Measures "the average reduction in |
171 |
# the entropy of C (classification) given the value of X (the rule)". Makes |
172 |
# a good ranking measure with a proper statistical basis. ;) |
173 |
# |
174 |
# Still would like to get an entropy measure in, too. |
175 |
# |
176 |
# sum P(X = x ^ C = c) |
177 |
# IG(X,C) = x in [0, 1] P(X = x ^ C = c) . log2( ------------------- ) |
178 |
# c in [Ch, Cs] P(X = x) . P(C = c) |
179 |
# |
180 |
my $safe_nspam = $num_spam || 0.0000001; |
181 |
my $safe_nham = $num_ham || 0.0000001; |
182 |
|
183 |
my $num_all = ($num_spam + $num_ham); |
184 |
my $safe_all = $num_all || 0.0000001; |
185 |
my $f_all = $fs+$fn; |
186 |
|
187 |
my $px0 = (($num_all - $f_all) / $safe_all); # P(X = 0) |
188 |
my $px1 = ($f_all / $safe_all); # P(X = 1) |
189 |
my $pccs = ($num_spam / $safe_all); # P(C = Cs) |
190 |
my $pcch = ($num_ham / $safe_all); # P(C = Ch) |
191 |
my $px1ccs = ($fs / $safe_nspam); # P(X = 1 ^ C = Cs) |
192 |
my $px1cch = ($fn / $safe_nham); # P(X = 1 ^ C = Ch) |
193 |
my $px0ccs = (($num_spam - $fs) / $safe_nspam); # P(X = 0 ^ C = Cs) |
194 |
my $px0cch = (($num_ham - $fn) / $safe_nham); # P(X = 0 ^ C = Ch) |
195 |
my $safe_px0_dot_pccs = ($px0 * $pccs) || 0.00000001; |
196 |
my $safe_px0_dot_pcch = ($px0 * $pcch) || 0.00000001; |
197 |
my $safe_px1_dot_pccs = ($px1 * $pccs) || 0.00000001; |
198 |
my $safe_px1_dot_pcch = ($px1 * $pcch) || 0.00000001; |
199 |
|
200 |
sub log2 { return log($_[0]) / 0.693147180559945; } # log(2) = 0.6931... |
201 |
|
202 |
my $safe_px0ccs = ($px0ccs || 0.0000001); |
203 |
my $safe_px0cch = ($px0cch || 0.0000001); |
204 |
my $safe_px1ccs = ($px1ccs || 0.0000001); |
205 |
my $safe_px1cch = ($px1cch || 0.0000001); |
206 |
$rank = ( $px0ccs * log2($safe_px0ccs / $safe_px0_dot_pccs) ) + |
207 |
( $px0cch * log2($safe_px0cch / $safe_px0_dot_pcch) ) + |
208 |
( $px1ccs * log2($safe_px1ccs / $safe_px1_dot_pccs) ) + |
209 |
( $px1cch * log2($safe_px1cch / $safe_px1_dot_pcch) ); |
210 |
|
211 |
$ranking{$test} = $rank; |
212 |
$rank_hi = $rank if ($rank > $rank_hi); |
213 |
$rank_lo = $rank if ($rank < $rank_lo); |
214 |
} |
215 |
else { |
216 |
# basic wanted/unwanted ranking |
217 |
$wanted{$test} = $isnice ? $fn : $fs; |
218 |
$unwanted{$test} = $isnice ? $fs : $fn; |
219 |
# count number of ranks of each type |
220 |
$wranks{$wanted{$test}} = 1; |
221 |
$uranks{$unwanted{$test}} = 1; |
222 |
} |
223 |
} |
133 |
} |
224 |
|
134 |
|
225 |
# finish basic wanted/unwanted ranking |
135 |
if ($opt_p) { |
226 |
if (! $opt_i) { |
136 |
$opt_x = 1; |
227 |
my @wanted = sort { $wanted{$a} <=> $wanted{$b} } keys %wanted; |
|
|
228 |
my @unwanted = sort { $unwanted{$b} <=> $unwanted{$a} } keys %unwanted; |
229 |
|
230 |
# first half of ranking is the wanted rank |
231 |
my $position = 0; |
232 |
my $last = undef; |
233 |
for my $test (@wanted) { |
234 |
$position++ if defined $last && $last != $wanted{$test}; |
235 |
$ranking{$test} += $position; |
236 |
$last = $wanted{$test} |
237 |
} |
238 |
|
239 |
# second half of ranking is the unwanted rank |
240 |
my $normalize = (scalar keys %wranks) / (scalar keys %uranks); |
241 |
$position = 0; |
242 |
$last = undef; |
243 |
for my $test (@unwanted) { |
244 |
$position++ if defined $last && $last != $unwanted{$test}; |
245 |
$ranking{$test} += ($position * $normalize); |
246 |
$last = $unwanted{$test}; |
247 |
$rank_hi = $ranking{$test} if ($ranking{$test} > $rank_hi); |
248 |
$rank_lo = $ranking{$test} if ($ranking{$test} < $rank_lo); |
249 |
} |
250 |
} |
137 |
} |
251 |
|
138 |
|
252 |
{ |
139 |
$opt_s = 0 if ( !defined $opt_s ); |
253 |
# now normalise the rankings to [0, 1] |
|
|
254 |
$rank_hi -= $rank_lo; |
255 |
foreach $test (@tests) { |
256 |
$ranking{$test} = $rank_hi == 0 ? 0.001 : ($ranking{$test} - $rank_lo) / ($rank_hi); |
257 |
} |
258 |
} |
259 |
|
140 |
|
260 |
foreach $test (sort { $ranking{$b} <=> $ranking{$a} } @tests) { |
141 |
my $ok_lang = lc ( $opt_inclang || $opt_L || ''); |
261 |
next unless (exists $rules{$test}); # only valid tests |
142 |
$ok_lang = '.' if ($ok_lang eq 'all'); |
262 |
next if (!$opt_a && $rules{$test}->{issubrule}); |
|
|
263 |
|
143 |
|
264 |
my $fs = $freq_spam{$test}; $fs ||= 0; |
144 |
my $greprules = sub { # To determine whether rule should be read |
265 |
my $fn = $freq_ham{$test}; $fn ||= 0; |
145 |
my ($name, $rule) = @_; |
266 |
my $fa = $fs+$fn; |
|
|
267 |
|
146 |
|
268 |
next if ($opt_m && $test !~ m/$opt_m/); # match certain tests |
147 |
return 0 if ($opt_m && $name !~ /$opt_m/); # name doesn't match -m |
269 |
next if ($opt_t && $rules{$test}->{tflags} !~ /$opt_t/); # match tflags |
148 |
# expression |
|
|
149 |
return 0 if ($opt_t && $rule->{tflags} !~ /$opt_t/); # tflags don't |
150 |
# match -t |
151 |
# expression |
152 |
return 0 if (($opt_L && !$rule->{lang}) || |
153 |
($rule->{lang} && |
154 |
(!$ok_lang || $rule->{lang} !~ /^$ok_lang/i))); # Wrong language |
270 |
|
155 |
|
|
|
156 |
return 0 if ($rule->{issubrule} && !$opt_a); |
157 |
|
271 |
if (!$opt_a && !$opt_t) { |
158 |
if (!$opt_a && !$opt_t) { |
272 |
next if ($rules{$test}->{tflags} =~ /net/ && ($opt_s % 2 == 0)); # not net tests |
159 |
return 0 if ($rule->{tflags} =~ /net/ && ($opt_s % 2 == 0)); |
273 |
next if ($rules{$test}->{tflags} =~ /userconf/); # or userconf |
160 |
return 0 if ($rule->{tflags} =~ /userconf/); # or userconf |
274 |
} |
161 |
} |
|
|
162 |
return 1; |
275 |
|
163 |
|
276 |
# adjust based on corpora sizes (and cvt to % while we're at it) |
164 |
}; |
277 |
my $fsadj = $num_spam == 0 ? 0 : ($fs / ($num_spam)) * 100.0; |
|
|
278 |
my $fnadj = $num_ham == 0 ? 0 : ($fn / ($num_ham)) * 100.0; |
279 |
|
165 |
|
280 |
if ($opt_f && $fsadj == 0 && $fnadj == 0) { next; } |
|
|
281 |
|
166 |
|
282 |
if ($opt_p) { |
167 |
my $logfile = $opt_l || "masses.log"; |
283 |
$fa = ($fa / ($num_spam + $num_ham)) * 100.0; |
|
|
284 |
$fs = $fsadj; |
285 |
$fn = $fnadj; |
286 |
} |
287 |
|
168 |
|
288 |
my $soratio = $soratio{$test}; |
169 |
if (!$opt_c || !scalar(@$opt_c)) { |
289 |
if (!defined $soratio) { |
170 |
# Try to read this in from the log, if possible |
290 |
$soratio{$test} = soratio ($fsadj, $fnadj); |
171 |
open IN, $logfile or die "Can't open $logfile: $!"; |
291 |
} |
172 |
my $files = 0; # are we in the files section? |
|
|
173 |
while(<IN>) { |
174 |
if (!$files) { |
175 |
if (/^\# SVN revision:/) { |
176 |
$opt_c = [ "$FindBin::Bin/../rules" ]; |
177 |
last; |
178 |
} elsif (/^\# Using configuration:$/) { |
179 |
$files = 1; |
180 |
} |
181 |
} elsif (/^\#\s+(.*)\s*$/) { |
182 |
push (@$opt_c, $1); |
183 |
} else { |
184 |
# All done! |
185 |
last; |
186 |
} |
187 |
} |
292 |
|
188 |
|
293 |
if ($opt_p) { |
189 |
foreach my $file (@$opt_c) { |
294 |
printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f %s\n", |
190 |
die "Can't read $file" unless -r $file; |
295 |
$fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}, $test; |
191 |
} |
296 |
|
|
|
297 |
} elsif ($opt_x) { |
298 |
printf "%7d %7d %7d %7.3f %6.2f %6.2f %s\n", |
299 |
$fa, $fs, $fn, $soratio, $ranking{$test}, $scores{$test}, $test; |
300 |
|
301 |
} else { |
302 |
printf "%10d %10d %10d %s\n", $fa, $fs, $fn, $test; |
303 |
} |
304 |
} |
192 |
} |
305 |
exit; |
193 |
|
|
|
194 |
my $masses = Mail::SpamAssassin::Masses->new({ rulesdir => $opt_c, |
195 |
scoreset => $opt_s, |
196 |
falsesonly => $opt_f, |
197 |
greprules => $greprules, |
198 |
logfile => $logfile, |
199 |
nologs => 1}); |
306 |
|
200 |
|
|
|
201 |
$masses->readrules(); |
202 |
$masses->readlogs(); |
203 |
$masses->do_statistics(); |
204 |
$masses->do_rank(); |
307 |
|
205 |
|
|
|
206 |
my $rules = $masses->get_rules_hash(); |
207 |
my $num_ham = $masses->get_num_ham(); |
208 |
my $num_spam = $masses->get_num_spam(); |
209 |
my $num_all = $num_ham + $num_spam; |
308 |
|
210 |
|
309 |
sub readlogs { |
211 |
if ($num_ham + $num_spam <= 0) { |
310 |
my $spam = $ARGV[0] || "spam.log"; |
212 |
die "Can't run hit-frequencies on 0 messages."; |
311 |
my $ham = $ARGV[1] || (-f "good.log" ? "good.log" : "ham.log"); |
213 |
} |
312 |
|
214 |
|
313 |
foreach my $file ($spam, $ham) { |
215 |
## Write header |
314 |
open (IN, "<$file") || die "Could not open file '$file': $!"; |
|
|
315 |
|
216 |
|
316 |
my $isspam = 0; ($file eq $spam) and $isspam = 1; |
217 |
if ($opt_p) { |
317 |
|
218 |
|
318 |
while (<IN>) { |
219 |
if ($opt_f) { |
319 |
next if (/^#/); |
220 |
printf "%7s %7s %7s %6s %6s %6s %s\n", |
320 |
next unless (!$opt_M || /$opt_M/o); |
221 |
"OVERALL%", "FNEG%", "FPOS%", "S/O", "RANK", "SCORE", "NAME"; |
321 |
next if ($opt_X && /$opt_X/o); |
222 |
} else { |
|
|
223 |
printf "%7s %7s %7s %6s %6s %6s %s\n", |
224 |
"OVERALL%", "SPAM%", "HAM%", "S/O", "RANK", "SCORE", "NAME"; |
225 |
} |
322 |
|
226 |
|
323 |
/^(.)\s+(-?\d+)\s+(\S+)\s*(\S*)/ or next; |
227 |
printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n", |
324 |
my $caught = ($1 eq 'Y'); |
228 |
$num_all, $num_spam, $num_ham, |
325 |
my $hits = $2; |
229 |
$num_spam / $num_all, 0, 0; |
326 |
$_ = $4; s/,,+/,/g; |
|
|
327 |
|
230 |
|
328 |
if ($isspam) { |
231 |
printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f (all messages as %%)\n", |
329 |
if ($opt_f) { |
232 |
100.0, $num_spam / $num_all * 100.0, $num_ham / $num_all * 100.0, |
330 |
if (!$caught) { $num_spam++; } |
233 |
$num_spam / $num_all, 0, 0; |
331 |
} else { |
|
|
332 |
$num_spam++; |
333 |
} |
334 |
} else { |
335 |
if ($opt_f) { |
336 |
if ($caught) { $num_ham++; } |
337 |
} else { |
338 |
$num_ham++; |
339 |
} |
340 |
} |
341 |
|
234 |
|
342 |
my @tests = split (/,/, $_); |
235 |
} elsif ($opt_x) { |
343 |
foreach my $t (@tests) { |
236 |
printf "%7s %7s %7s %6s %6s %6s %s\n", |
344 |
next if ($t eq ''); |
237 |
"OVERALL", "SPAM", "HAM", "S/O", "RANK", "SCORE", "NAME"; |
345 |
if ($isspam) { |
238 |
printf "%7d %7d %7d %7.3f %6.2f %6.2f (all messages)\n", |
346 |
if ($opt_f) { |
239 |
$num_all, $num_spam, $num_ham, |
347 |
if (!$caught) { $freq_spam{$t}++; } |
240 |
$num_spam / $num_all, 0, 0; |
348 |
} else { |
|
|
349 |
$freq_spam{$t}++; |
350 |
} |
351 |
} else { |
352 |
if ($opt_f) { |
353 |
if ($caught) { $freq_ham{$t}++; } |
354 |
} else { |
355 |
$freq_ham{$t}++; |
356 |
} |
357 |
} |
358 |
} |
359 |
} |
360 |
close IN; |
361 |
} |
362 |
} |
363 |
|
241 |
|
364 |
|
242 |
} else { |
365 |
sub readscores { |
243 |
printf "%10s %10s %10s %s\n", |
366 |
my($cffile) = @_; |
244 |
"OVERALL", "SPAM", "HAM", "NAME"; |
367 |
system ("$FindBin::Bin/parse-rules-for-masses -d \"$cffile\" -s $opt_s") and die; |
245 |
printf "%10d %10d %10d (all messages)\n", |
368 |
require "./tmp/rules.pl"; |
246 |
$num_all, $num_spam, $num_ham; |
369 |
} |
247 |
} |
370 |
|
248 |
|
371 |
sub soratio { |
249 |
foreach my $test (sort { $rules->{$b}->{rank} <=> $rules->{$a}->{rank} } keys %{$rules}) { |
372 |
my ($s, $n) = @_; |
|
|
373 |
|
250 |
|
374 |
$s ||= 0; |
251 |
if ($opt_p) { |
375 |
$n ||= 0; |
252 |
printf "%7.3f %7.4f %7.4f %7.3f %6.2f %6.2f %s\n", |
376 |
|
253 |
($rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham}) / $num_all * 100.0, |
377 |
if ($s + $n > 0) { |
254 |
$rules->{$test}->{spam_percent}, $rules->{$test}->{ham_percent}, |
378 |
return $s / ($s + $n); |
255 |
$rules->{$test}->{soratio}, $rules->{$test}->{rank}, $rules->{$test}->{score}, $test; |
|
|
256 |
} elsif ($opt_x) { |
257 |
printf "%7d %7d %7d %7.3f %6.2f %6.2f %s\n", |
258 |
$rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham}, |
259 |
$rules->{$test}->{freq_spam}, $rules->{$test}->{freq_ham}, |
260 |
$rules->{$test}->{soratio}, $rules->{$test}->{rank}, $rules->{$test}->{score}, $test; |
379 |
} else { |
261 |
} else { |
380 |
return 0.5; # no results -> not effective |
262 |
printf "%10d %10d %10d %s\n", |
|
|
263 |
$rules->{$test}->{freq_spam} + $rules->{$test}->{freq_ham}, |
264 |
$rules->{$test}->{freq_spam}, $rules->{$test}->{freq_ham}, $test; |
381 |
} |
265 |
} |
382 |
} |
266 |
} |
383 |
|
267 |
|
384 |
sub tcr { |
|
|
385 |
my ($nspam, $nlegit, $nspamspam, $nlegitspam) = @_; |
386 |
my $nspamlegit = $nspam - $nspamspam; |
387 |
my $nlegitlegit = $nlegit - $nlegitspam; |
388 |
|
389 |
my $lambda = 99; |
390 |
|
391 |
my $werr = ($lambda * $nlegitspam + $nspamlegit) |
392 |
/ ($lambda * $nlegit + $nspam); |
393 |
|
394 |
my $werr_base = $nspam |
395 |
/ ($lambda * $nlegit + $nspam); |
396 |
|
397 |
$werr ||= 0.000001; # avoid / by 0 |
398 |
my $tcr = $werr_base / $werr; |
399 |
return $tcr; |
400 |
} |