# Review notes.  patch(1) skips text outside the diffs, so these lines are inert.
#
# This patch was rebuilt from a copy whose newlines and indentation had been
# collapsed and whose <...> spans had been stripped.  Restored here:
#   * one diff line per physical line, with conventional indentation;
#   * the readline operators <IN>, <F> and <TESTING>, and the <log>/<file>/
#     <class>:<format>:<location> placeholders in the mass-check usage text;
#   * blank context lines, so every hunk matches its @@ counts.  Their exact
#     positions are a best guess -- regenerate against r9714 before applying.
#   * mass-check hunk "-20,8 +20,6": almost entirely lost in extraction and
#     rebuilt from the hunk arithmetic (two removed usage lines, -c and -p).
#     NOTE(review): confirm its text against mass-check r9714.
#
# Code fixes relative to the collapsed copy (all inside '+' lines, no hunk
# count changes):
#   * hit-frequencies: the single-argument "new style" case now reads the
#     file named on the command line instead of always reading masses.log.
#   * mass-check: 'use vars' declares $opt_log (it listed $opt_p twice).
#   * mass-check: opening the combined log for writing is checked.
#   * logs-to-c: the default log name uses ||= so --log is no longer
#     overwritten after GetOptions, and opening the log is checked.

Index: hit-frequencies
===================================================================
--- hit-frequencies	(revision 9714)
+++ hit-frequencies	(working copy)
@@ -223,34 +223,37 @@
 
 
 sub readlogs {
-  my $spam = $ARGV[0] || "spam.log";
-  my $ham = $ARGV[1] || (-f "good.log" ? "good.log" : "ham.log");
-  foreach my $file ($spam, $ham) {
+  # argh backwards compat
+
+  my @files;
+  if (-f "masses.log" || ($ARGV[0] && !$ARGV[1])) { # New style
+    @files = (($ARGV[0] && !$ARGV[1]) ? $ARGV[0] : "masses.log");
+  }
+  else {
+    $files[0] = $ARGV[0] || "spam.log";
+    $files[1] = $ARGV[1] || (-f "good.log" ? "good.log" : "ham.log");
+  }
+  foreach my $file (@files) {
     open (IN, "<$file") || die "Could not open file '$file': $!";
 
-    my $isspam = 0; ($file eq $spam) and $isspam = 1;
-
     while (<IN>) {
       next if (/^#/);
 
       next unless (!$opt_M || /$opt_M/o);
       next if ($opt_X && /$opt_X/o);
 
       /^(.)\s+(-?\d+)\s+(\S+)\s*(\S*)/ or next;
-      my $caught = ($1 eq 'Y');
+      my $spam = (($1 eq 's') || ($1 eq 'N'));
+      my $false = (($1 eq 'N') || ($1 eq 'P'));
       my $hits = $2;
       $_ = $4; s/,,+/,/g;
 
-      if ($isspam) {
-        if ($opt_f) {
-          if (!$caught) { $num_spam++; }
-        } else {
+      if ($spam) {
+        if (!$opt_f || $false) {
           $num_spam++;
         }
       } else {
-        if ($opt_f) {
-          if ($caught) { $num_ham++; }
-        } else {
+        if (!$opt_f || $false) {
           $num_ham++;
         }
       }
@@ -258,16 +261,12 @@
       my @tests = split (/,/, $_);
       foreach my $t (@tests) {
         next if ($t eq '');
-        if ($isspam) {
-          if ($opt_f) {
-            if (!$caught) { $freq_spam{$t}++; }
-          } else {
+        if ($spam) {
+          if (!$opt_f || $false) {
             $freq_spam{$t}++;
           }
         } else {
-          if ($opt_f) {
-            if ($caught) { $freq_ham{$t}++; }
-          } else {
+          if (!$opt_f || $false) {
             $freq_ham{$t}++;
           }
         }
Index: rule-qa/corpus-nightly
===================================================================
--- rule-qa/corpus-nightly	(revision 9714)
+++ rule-qa/corpus-nightly	(working copy)
@@ -76,5 +76,4 @@
 date > test.end
 
 # submit results
-rsync -CPcvuzb --timeout=120 ham.log $username@rsync.spamassassin.org::corpus/ham-$net$username.log
-rsync -CPcvuzb --timeout=120 spam.log $username@rsync.spamassassin.org::corpus/spam-$net$username.log
+rsync -CPcvuzb --timeout=120 masses.log $username@rsync.spamassassin.org::corpus/masses-$net$username.log
Index: Makefile
===================================================================
--- Makefile	(revision 9714)
+++ Makefile	(working copy)
@@ -26,7 +26,7 @@
 tmp/ranges.data: tmp/.created freqs score-ranges-from-freqs
 	perl score-ranges-from-freqs ../rules $(SCORESET) < freqs
 
-freqs: spam.log ham.log
+freqs: masses.log
 	perl hit-frequencies -x -p -s $(SCORESET) > freqs
 
 badrules: freqs
@@ -36,6 +36,10 @@
 	-mkdir tmp
 	touch tmp/.created
 
+masses.log:
+	cat spam.log > masses.log
+	cat ham.log >> masses.log
+
 clean:
 	rm -rf *.o perceptron tmp freqs
 
Index: mass-check
===================================================================
--- mass-check	(revision 9714)
+++ mass-check	(working copy)
@@ -20,8 +20,6 @@
   die <<ENDOFUSAGE;
 usage: mass-check [options] target ...
 
-  -c=file       set configuration/rules directory
-  -p=dir        set user-prefs directory
   -f=file       read list of targets from <file>
   -j=jobs       specify the number of processes to run simultaneously
   --net         turn on network checks!
@@ -32,14 +30,21 @@
   --showdots    print a dot for each scanned message
   --rules=RE    Only test rules matching the given regexp RE
   --restart=N   restart all of the children after processing N messages
-  
+
+  SpamAssassin options
+  -c=dir        set configuration/rules directory
+  -p=file       set user preferences file
+  -s=dir        set site rules configuration directory
+  -u=dir        set user-state directory
+  --dist        assumes the script is being run from the masses/ dir of
+                the unpacked tarball, and makes appropriate guesses for -p and -c
+
   log options
   -o            write all logs to stdout
   --loghits     log the text hit for patterns (useful for debugging)
   --loguris     log the URIs found
-  --hamlog=log  use <log> as ham log ('ham.log' is default)
-  --spamlog=log use <log> as spam log ('spam.log' is default)
-  
+  --log=file    log to <file> (masses.log is default)
+
   message selection options
   -n            no date sorting or spam/ham interleaving
   --after=N     only test mails received after time_t N (negative values
@@ -48,15 +53,17 @@
   --all         don't skip big messages
   --head=N      only check first N ham and N spam (N messages if -n used)
   --tail=N      only check last N ham and N spam (N messages if -n used)
-  
+
   simple target options (implies -o and no ham/spam classification)
   --dir         subsequent targets are directories
   --file        subsequent targets are files in RFC 822 format
   --mbox        subsequent targets are mbox files
-  
+
   Just left over functions we should remove at some point:
   --bayes       report score from Bayesian classifier
-  
+  --hamlog=log  use <log> as ham log ('ham.log' is default)
+  --spamlog=log use <log> as spam log ('spam.log' is default)
+
   non-option arguments are used as target names (mail files and folders),
   the target format is: <class>:<format>:<location>
   <class>       is "spam" or "ham"
@@ -69,10 +76,10 @@
 ###########################################################################
 
 use vars qw($opt_c $opt_p $opt_f $opt_j $opt_n $opt_o $opt_all $opt_bayes
-            $opt_debug $opt_format $opt_hamlog $opt_head $opt_loghits
-            $opt_mid $opt_mh $opt_ms $opt_net $opt_nosort $opt_progress
+            $opt_debug $opt_dist $opt_format $opt_hamlog $opt_head $opt_log $opt_loghits
+            $opt_mid $opt_mh $opt_ms $opt_net $opt_nosort $opt_progress $opt_s
             $opt_showdots $opt_spamlog $opt_tail $opt_rules $opt_restart
-            $opt_loguris $opt_after $opt_rewrite);
+            $opt_loguris $opt_after $opt_rewrite $opt_u);
 
 use FindBin;
 use lib "$FindBin::Bin/../lib";
@@ -84,22 +91,25 @@
 use constant HAS_TIME_PARSEDATE => eval { require Time::ParseDate; };
 
 # default settings
-$opt_c = "$FindBin::Bin/../rules";
-$opt_p = "$FindBin::Bin/spamassassin";
+
 $opt_j = 1;
 $opt_net = 0;
-$opt_hamlog = "ham.log";
-$opt_spamlog = "spam.log";
+$opt_log = "masses.log";
 
-GetOptions("c=s", "p=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug",
-           "hamlog=s", "head=i", "loghits", "mh", "mid", "ms", "net",
-           "progress", "rewrite:s", "showdots", "spamlog=s", "tail=i",
-           "rules=s", "restart=i", "after=s", "loguris",
+GetOptions("c=s", "f=s", "j=i", "n", "o", "all", "bayes", "debug", "dist",
+           "hamlog=s", "head=i", "log=s", "loghits", "mh", "mid", "ms",
+           "net", "p=s", "progress", "rewrite:s", "s=s", "showdots", "spamlog=s", "tail=i",
+           "rules=s", "restart=i", "u=s", "after=s", "loguris",
            "dir" => sub { $opt_format = "dir"; },
            "file" => sub { $opt_format = "file"; },
            "mbox" => sub { $opt_format = "mbox"; },
            '<>' => \&target) or usage();
 
+if ($opt_hamlog || $opt_spamlog) { # Old style logging
+  $opt_hamlog ||= "ham.log";
+  $opt_spamlog ||= "spam.log";
+}
+
 if ($opt_f) {
   open (F, $opt_f) || die $!;
   push (@targets, map { chomp; $_ } <F>);
@@ -108,6 +118,15 @@
 
 if (scalar @targets == 0) { usage(); }
 
+# Auto-detect --dist option
+if (!defined $opt_dist) {
+  if (-f "$FindBin::Bin/../spamassassin.raw") {
+    warn "Automatically using --dist. Assuming you are running from the unpacked tarball.";
+    $opt_dist = 1;
+  }
+}
+
+
 #if ($opt_ms) {
   #find_missed($opt_spamlog);
 #}
@@ -115,37 +134,68 @@
   #find_missed($opt_hamlog);
 #}
 
+my $local_rules_dir;
+
+if ($opt_dist) { # Set defaults
+  $opt_c ||= "$FindBin::Bin/../rules";
+  $opt_p ||= "$FindBin::Bin/spamassassin/user_prefs";
+  $opt_u ||= "$FindBin::Bin/spamassassin";
+  $opt_s ||= "$FindBin::Bin/spamassassin";
+  $local_rules_dir = '';
+}
+else {
+  if(!$opt_u) {
+    # Assuming this is OK, since mass-check isnt supported on windows, is it?
+    # Also, should there be some check to make sure that previous mass-check stuff isn't in there?
+    # AFAICT, there isn't otherwise....
+    if ( -d "${ENV{HOME}}/.spamassassin" ) {
+      $opt_u = "${ENV{HOME}}/.spamassassin/mass-check";
+      mkdir $opt_u, 0700 if (! -d $opt_u);
+    }
+  }
+}
+
 $spamtest = new Mail::SpamAssassin ({
-  'debug'                             => $opt_debug,
-  'rules_filename'                    => $opt_c,
-  'userprefs_filename'                => "$opt_p/user_prefs",
-  'site_rules_filename'               => "$opt_p/local.cf",
-  'userstate_dir'                     => "$opt_p",
-  'save_pattern_hits'                 => $opt_loghits,
-  'dont_copy_prefs'                   => 1,
-  'local_tests_only'                  => $opt_net ? 0 : 1,
-  'only_these_rules'                  => $opt_rules,
-  'ignore_safety_expire_timeout'      => 1,
-  PREFIX                              => '',
-  DEF_RULES_DIR                       => $opt_c,
-  LOCAL_RULES_DIR                     => '',
-});
+  'debug'                             => $opt_debug,
+  'rules_filename'                    => $opt_c,
+  'userprefs_filename'                => $opt_p,
+  'site_rules_filename'               => $opt_s,
+  'userstate_dir'                     => $opt_u,
+  'save_pattern_hits'                 => $opt_loghits,
+  'dont_copy_prefs'                   => 1,
+  'local_tests_only'                  => $opt_net ? 0 : 1,
+  'only_these_rules'                  => $opt_rules,
+  'ignore_safety_expire_timeout'      => 1,
+  DEF_RULES_DIR                       => $opt_c,
+  LOCAL_RULES_DIR                     => $local_rules_dir,
+                                     });
 
 $spamtest->compile_now(1);
-$spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf");
+if ($opt_dist) {
+  $spamtest->read_scoreonly_config("$FindBin::Bin/mass-check.cf");
+}
 
 my $who = `id -un 2>/dev/null`;   chomp $who;
 my $where = `uname -n 2>/dev/null`; chomp $where;
 my $when = `date -u`;             chomp $when;
-my $revision = "unknown";
-if (open(TESTING, "$opt_c/70_testing.cf")) {
-  chomp($revision = <TESTING>);
-  $revision =~ s/.*\$Rev:\s*(\S+).*/$1/;
-  close(TESTING);
+my $revision;
+
+if ($opt_dist) {
+  my $rev = "unknown";
+  if (open(TESTING, "$opt_c/70_testing.cf")) {
+    chomp($rev = <TESTING>);
+    $rev =~ s/.*\$Rev:\s*(\S+).*/$1/;
+    close(TESTING);
+  }
+  $revision = "SVN revision: $rev";
 }
+else {
+  $revision = "Locally generated";
+}
+
 my $log_header = "# mass-check results from $who\@$where, on $when\n" .
                  "# M:SA version ".$spamtest->Version()."\n" .
-                 "# SVN revision: $revision\n";
+                 "# $revision\n";
 my $host = $ENV{'HOSTNAME'} || $ENV{'HOST'} || `hostname` || 'localhost';
 chomp $host;
 
@@ -206,7 +256,7 @@
     autoflush STDOUT 1;
     print STDOUT $log_header;
   }
-  else {
+  elsif ($opt_hamlog || $opt_spamlog) {
     open(HAM, "> $opt_hamlog");
     open(SPAM, "> $opt_spamlog");
     autoflush HAM 1;
@@ -214,6 +264,11 @@
     print HAM $log_header;
     print SPAM $log_header;
   }
+  else {
+    open(OUT, "> $opt_log") || die "Could not open log file '$opt_log': $!";
+    autoflush OUT 1;
+    print OUT $log_header;
+  }
   $init_results = 1;
 }
 
@@ -223,25 +278,36 @@
   # don't open results files until we get here to avoid overwriting files
   &init_results if !$init_results;
 
-  if ($class eq "s") {
-    if ($opt_o) { print STDOUT $result; } else { print SPAM $result; }
-    $spam_count++;
+  if ($opt_o) {
+    print STDOUT $result;
   }
-  elsif ($class eq "h") {
-    if ($opt_o) { print STDOUT $result; } else { print HAM $result; }
-    $ham_count++;
+  elsif ($opt_spamlog || $opt_hamlog) {
+    if ($class eq "s") {
+      print SPAM $result;
+    } else {
+      print HAM $result;
+    }
   }
+  else {
+    print OUT $result;
+  }
 
   $total_count++;
   #warn ">> result: $total_count $class $time\n";
 
   if ($opt_progress) {
+    if ($class eq "s") {
+      $spam_count++;
+    }
+    else {
+      $ham_count++;
+    }
     progress($time);
   }
 }
 
 sub wanted {
-  my (undef, $id, $time, $dataref) = @_;
+  my ($class, $id, $time, $dataref) = @_;
   my $out;
 
   my $ma = $spamtest->parse($dataref);
@@ -289,18 +355,22 @@
     push(@extra, "mid=$mid");
   }
 
-  my $yorn;
+  my $type;
   my $score;
   my $tests;
   my $extra;
   if ($opt_loguris) {
-    $yorn = '.';
+    $type = '.';
     $score = 0;
     $tests = join (" ", sort @uris);
     $extra = '';
   }
   else {
-    $yorn = $status->is_spam() ? 'Y' : '.';
+    if ($class eq "s") {
+      $type = $status->is_spam() ? "s" : "N"; # Capitals are falses
+    } else {
+      $type = $status->is_spam() ? "P" : "h";
+    }
     $score = $status->get_score();
     $tests = join(",", sort(grep(length,$status->get_names_of_tests_hit(),$status->get_names_of_subtests_hit())));
     $extra = join(",", @extra);
@@ -314,7 +384,7 @@
 
   $id =~ s/\s/_/g;
 
-  $out .= sprintf("%s %2d %s %s %s\n", $yorn, $score, $id, $tests, $extra);
+  $out .= sprintf("%s %2d %s %s %s\n", $type, $score, $id, $tests, $extra);
 
   if ($tests =~ /MICROSOFT_EXECUTABLE|MIME_SUSPECT_NAME/) {
     $out .= logkilled($ma, $id, "possible virus");
Index: logs-to-c
===================================================================
--- logs-to-c	(revision 9714)
+++ logs-to-c	(working copy)
@@ -18,9 +18,9 @@
 
 use Getopt::Long;
 use vars qw($opt_cffile $opt_count $opt_lambda $opt_threshold
-        $opt_spam $opt_nonspam);
+        $opt_log);
 
-GetOptions("cffile=s", "count", "lambda=f", "threshold=f", "spam=s", "nonspam=s", "scoreset=i");
+GetOptions("cffile=s", "count", "lambda=f", "threshold=f", "log=s", "scoreset=i");
 my $argcffile = $opt_cffile;
 
 my $justcount = 0;
@@ -29,8 +29,9 @@
 my $threshold = 5;
 if (defined $opt_threshold) { $threshold = $opt_threshold; }
 
-$opt_spam ||= 'spam.log';
-$opt_nonspam ||= 'ham.log';
+#$opt_spam ||= 'spam.log';
+#$opt_nonspam ||= 'ham.log';
+$opt_log ||= 'masses.log';
 $opt_scoreset = 0 if ( !defined $opt_scoreset );
 
 my $nybias = 10;
@@ -78,68 +79,66 @@
     $yyscore = $ynscore = $nyscore = $nnscore = 0.0;
   }
 
-  foreach my $file ($opt_spam, $opt_nonspam) {
-    open (IN, "<$file");
+  open (IN, "<$opt_log") || die "Could not open file '$opt_log': $!";
 
-    while (<IN>) {
-      next if /^\#/;
-      next if /^$/;
-      if($_ !~ /^.\s+([-\d]+)\s+\S+\s*/) { warn "bad line: $_"; next; }
-      my $hits = $1;
-#my $foo = $_;
-      $_ = $'; s/(?:bayes|time)=\S+//; s/,,+/,/g; s/^\s+//; s/\s+$//;
+  while (<IN>) {
+    next if /^\#/;
+    next if /^$/;
+    if($_ !~ /^(.)\s+([-\d]+)\s+\S+\s*/) { warn "bad line: $_"; next; }
+    my $type = $1;
+    my $hits = $2;
+    $_ = $'; s/(?:bayes|time)=\S+//; s/,,+/,/g; s/^\s+//; s/\s+$//;
 
-      my $score = 0;
-      my @tests = ();
-      foreach my $tst (split (/,/, $_)) {
-        next if ($tst eq '');
-        if (!defined $scores{$tst}) {
-          #warn "unknown test in $file, ignored: $tst\n";
-          next;
-        }
+    my $score = 0;
+    my @tests = ();
+    foreach my $tst (split (/,/, $_)) {
+      next if ($tst eq '');
+      if (!defined $scores{$tst}) {
+        #warn "unknown test in $opt_log, ignored: $tst\n";
+        next;
+      }
 
-        # Make sure to skip any subrules!
-        next if ( $allrules{$tst}->{issubrule} );
+      # Make sure to skip any subrules!
+      next if ( $allrules{$tst}->{issubrule} );
 
-        if ($justcount) {
-          $score += $scores{$tst};
-        } else {
-          push (@tests, $tst);
-        }
+      if ($justcount) {
+        $score += $scores{$tst};
+      } else {
+        push (@tests, $tst);
       }
+    }
 
-      if (!$justcount) {
-        $tests_hit{$count} = \@tests;
-      }
+    if (!$justcount) {
+      $tests_hit{$count} = \@tests;
+    }
 
-      if ($file eq $opt_spam) {
-        $num_spam++;
-        if ($justcount) {
-          if ($score >= $threshold) {
-            $ga_yy++; $yyscore += $score;
-          } else {
-            $ga_yn++; $ynscore += $score;
-          }
-        } else {
-          $is_spam{$count} = 1;
-        }
+    if (($type eq 's') || ($type eq 'N')) {
+      $num_spam++;
+      if ($justcount) {
+        if ($score >= $threshold) {
+          $ga_yy++; $yyscore += $score;
+        } else {
+          $ga_yn++; $ynscore += $score;
+        }
       } else {
-        $num_nonspam++;
-        if ($justcount) {
-          if ($score >= $threshold) {
-#print "$score -- $foo";
-            $ga_ny++; $nyscore += $score;
-          } else {
-            $ga_nn++; $nnscore += $score;
-          }
-        } else {
-          $is_spam{$count} = 0;
-        }
+        $is_spam{$count} = 1;
       }
-      $count++;
+    } else {
+      $num_nonspam++;
+      if ($justcount) {
+        if ($score >= $threshold) {
+          #print "$score -- $foo";
+          $ga_ny++; $nyscore += $score;
+        } else {
+          $ga_nn++; $nnscore += $score;
+        }
+      } else {
+        $is_spam{$count} = 0;
+      }
     }
-    close IN;
+    $count++;
   }
+  close IN;
 
   $num_tests = $count;
 }