diff -urN SpamAssassin-2.21.orig/SpamAssassin/Conf.pm SpamAssassin-2.21-realrbl-rbltimeout-timelog/SpamAssassin/Conf.pm --- SpamAssassin-2.21.orig/SpamAssassin/Conf.pm Tue May 7 17:32:58 2002 +++ SpamAssassin-2.21-realrbl-rbltimeout-timelog/SpamAssassin/Conf.pm Tue Jun 4 23:54:29 2002 @@ -45,8 +45,9 @@ use vars qw{ @ISA $type_body_tests $type_head_tests $type_head_evals $type_body_evals $type_full_tests $type_full_evals - $type_rawbody_tests $type_rawbody_evals - $type_uri_tests $type_uri_evals + $type_rawbody_tests $type_rawbody_evals + $type_uri_tests $type_uri_evals + $type_rbl_evals $type_rbl_res_evals }; @ISA = qw(); @@ -61,6 +62,8 @@ $type_rawbody_evals = 108; $type_uri_tests = 109; $type_uri_evals = 110; +$type_rbl_evals = 120; +$type_rbl_res_evals = 121; ########################################################################### @@ -98,7 +101,19 @@ $self->{terse_report_template} = ''; $self->{spamtrap_template} = ''; + # What different RBLs consider a dialup IP -- Marc + $self->{dialup_codes} = { + "dialups.mail-abuse.org." => "127.0.0.3", + # For DUL + other codes, we ignore that it's on DUL + "rbl-plus.mail-abuse.org." => "127.0.0.2", + "relays.osirusoft.com." => "127.0.0.3", + }; + + $self->{num_check_received} = 2; + $self->{razor_config} = $main->sed_path ("~/razor.conf"); + $self->{razor_timeout} = 10; + $self->{rbl_timeout} = 30; # this will be sedded by whitelist implementations, so ~ is OK $self->{auto_whitelist_path} = "~/.spamassassin/auto-whitelist"; @@ -122,6 +137,7 @@ $self->{dcc_fuz1_max} = 999999; $self->{dcc_fuz2_max} = 999999; $self->{dcc_add_header} = 0; + $self->{dcc_timeout} = 10; $self->{whitelist_from} = { }; $self->{blacklist_from} = { }; @@ -394,6 +410,19 @@ $self->{check_mx_delay} = $1+0; next; } +=item rbl_timeout n (default 30) + +All RBL queries are started at the beginning and we try to read the results +at the end. In case some of them are hanging or not returning, you can specify +here how long you're willing to wait for them before deciding that they timed +out + +=cut + + if (/^rbl[-_]timeout\s+(\d+)$/) { + $self->{rbl_timeout} = $1+0; next; + } + =item ok_locales xx [ yy zz ... ] (default: en) Which locales (country codes) are considered OK to receive mail from. Mail @@ -606,12 +635,113 @@ $self->{dcc_add_header} = $1+0; next; } +=item dcc_timeout n (default: 10) + +How many seconds you wait for dcc to complete before you go on without +the results + +=cut + + if (/^dcc[-_]timeout\s*(\d+)\s*$/) { + $self->{dcc_timeout} = $1+0; next; + } + + +=item num_check_received { integer } (default: 2) + +How many received lines from and including the original mail relay +do we check in RBLs (you'd want at least 1 or 2). +Note that for checking against dialup lists, you can call check_rbl +with a special set name of "set-firsthop" and this rule will only +be matched against the first hop if there is more than one hop, so +that you can set a negative score to not penalize people who properly +relayed through their ISP. +See dialup_codes for more details and an example + +=cut + + if (/^num[-_]check[-_]received\s+(\d+)$/) { + $self->{num_check_received} = $1+0; next; + } ########################################################################### # SECURITY: no eval'd code should be loaded before this line. # if ($scoresonly && !$self->{allow_user_rules}) { goto failed_line; } + +# If you think, this is complex, you should have seen the four previous +# implementations that I scratched :-) +# Once you understand this, you'll see it's actually quite flexible -- Marc + +=item dialup_codes { "domain1" => "127.0.x.y", "domain2" => "127.0.a.b" } + +Default: +{ "dialups.mail-abuse.org." => "127.0.0.3", +# For DUL + other codes, we ignore that it's on DUL + "rbl-plus.mail-abuse.org." => "127.0.0.2", + "relays.osirusoft.com." => "127.0.0.3" }; + +WARNING!!! When passing a reference to a hash, you need to put the whole hash in +one line for the parser to read it correctly (you can check with spamassassin -D +< mesg) + +Set this to what your RBLs return for dialup IPs +It is used by dialup-firsthop and relay-firsthop rules so that you can match +DUL codes and compensate DUL checks with a negative score if the IP is a dialup +IP the mail originated from and it was properly relayed by a hop before reaching +you (hopefully not your secondary MX :-D) +The trailing "-firsthop" is magic, it's what triggers the RBL to only be run +on the originating hop +The idea is to not penalize (or penalize less) people who properly relayed +through their ISP's mail server + +Here's an example showing the use of Osirusoft and MAPS DUL, as well as the use +of check_two_rbl_results to compensate for a match in both RBLs + +header RCVD_IN_DUL rbleval:check_rbl('dialup', 'dialups.mail-abuse.org.') +describe RCVD_IN_DUL Received from dialup, see http://www.mail-abuse.org/dul/ +score RCVD_IN_DUL 4 + +header X_RCVD_IN_DUL_FH rbleval:check_rbl('dialup-firsthop', 'dialups.mail-abuse.org.') +describe X_RCVD_IN_DUL_FH Received from first hop dialup, see http://www.mail-abuse.org/dul/ +score X_RCVD_IN_DUL_FH -3 + +header RCVD_IN_OSIRUSOFT_COM rbleval:check_rbl('osirusoft', 'relays.osirusoft.com.') +describe RCVD_IN_OSIRUSOFT_COM Received via an IP flagged in relays.osirusoft.com + +header X_OSIRU_SPAM_SRC rbleval:check_rbl_results_for('osirusoft', '127.0.0.4') +describe X_OSIRU_SPAM_SRC DNSBL: sender is Confirmed Spam Source, penalizing further +score X_OSIRU_SPAM_SRC 3.0 + +header X_OSIRU_SPAMWARE_SITE rbleval:check_rbl_results_for('osirusoft', '127.0.0.6') +describe X_OSIRU_SPAMWARE_SITE DNSBL: sender is a Spamware site or vendor, penalizing further +score X_OSIRU_SPAMWARE_SITE 5.0 + +header X_OSIRU_DUL_FH rbleval:check_rbl('osirusoft-dul-firsthop', 'relays.osirusoft.com.') +describe X_OSIRU_DUL_FH Received from first hop dialup listed in relays.osirusoft.com +score X_OSIRU_DUL_FH -1.5 + +header Z_FUDGE_DUL_MAPS_OSIRU rblreseval:check_two_rbl_results('osirusoft', "127.0.0.3", 'dialup', "127.0.0.3") +describe Z_FUDGE_DUL_MAPS_OSIRU Do not double penalize for MAPS DUL and Osirusoft DUL +score Z_FUDGE_DUL_MAPS_OSIRU -2 + +header Z_FUDGE_RELAY_OSIRU rblreseval:check_two_rbl_results('osirusoft', "127.0.0.2", 'relay', "127.0.0.2") +describe Z_FUDGE_RELAY_OSIRU Do not double penalize for being an open relay on Osirusoft and another DNSBL +score Z_FUDGE_RELAY_OSIRU -2 + +header Z_FUDGE_DUL_OSIRU_FH rblreseval:check_two_rbl_results('osirusoft-dul-firsthop', "127.0.0.3", 'dialup-firsthop', "127.0.0.3") +describe Z_FUDGE_DUL_OSIRU_FH Do not double compensate for MAPS DUL and Osirusoft DUL first hop dialup +score Z_FUDGE_DUL_OSIRU_FH 1.5 + +=cut + + if (/^dialup_codes\s+(.*)$/) { + $self->{dialup_codes} = eval $1; + next; + } + + =back =head1 SETTINGS @@ -664,8 +794,21 @@ are optional arguments to the function call. =cut + if (/^header\s+(\S+)\s+rbleval:(.*)$/) { + $self->add_test ($1, $2, $type_rbl_evals); next; + } + if (/^header\s+(\S+)\s+rblreseval:(.*)$/) { + $self->add_test ($1, $2, $type_rbl_res_evals); next; + } + if (/^header\s+(\S+)\s+eval:(.*)$/) { - $self->add_test ($1, $2, $type_head_evals); next; + my ($name,$rule) = ($1, $2); + # Backward compatibility with old rule names -- Marc + if ($name =~ /^RCVD_IN/) { + $self->add_test ($name, $rule, $type_rbl_evals); next; + } else { + $self->add_test ($name, $rule, $type_head_evals); next; + } } if (/^header\s+(\S+)\s+(.*)$/) { $self->add_test ($1, $2, $type_head_tests); next; @@ -782,6 +925,17 @@ $self->{razor_config} = $1; next; } +=item razor_timeout n (default 10) + +How many seconds you wait for razor to complete before you go on without +the results + +=cut + + if (/^razor[-_]timeout\s*(\d+)\s*$/) { + $self->{razor_timeout} = $1; next; + } + =item dcc_options options Specify additional options to the dccproc(8) command. Please note that only @@ -809,6 +963,22 @@ $self->{auto_whitelist_path} = $1; next; } +=item timelog_path /path/to/dir (default: NULL) + +If you set this value, razor will try to create logfiles for each message I +processes and dump information on how fast it ran, and in which parts of the +code the time was spent. +The files will be named: unixdate_mesgid (i.e 1023257504_chuvn31gdu@4ax.com) + +Make sure SA can write the log file, if you're not sure what permissions +needed, make the log directory chmod'ed 1777, and adjust later. + +=cut + + if (/^timelog[-_]path\s*(.*)\s*$/) { + $Mail::SpamAssassin::TIMELOG->{logpath}=$1; next; + } + =item auto_whitelist_file_mode (default: 0700) The file mode bits used for the automatic-whitelist directory or file. @@ -915,6 +1085,8 @@ my $text = $self->{tests}->{$name}; if ($type == $type_body_tests) { $self->{body_tests}->{$name} = $text; } + elsif ($type == $type_rbl_evals) { $self->{rbl_evals}->{$name} = $text; } + elsif ($type == $type_rbl_res_evals) { $self->{rbl_res_evals}->{$name} = $text; } elsif ($type == $type_head_tests) { $self->{head_tests}->{$name} = $text; } elsif ($type == $type_head_evals) { $self->{head_evals}->{$name} = $text; } elsif ($type == $type_body_evals) { $self->{body_evals}->{$name} = $text; } diff -urN SpamAssassin-2.21.orig/SpamAssassin/Dns.pm SpamAssassin-2.21-realrbl-rbltimeout-timelog/SpamAssassin/Dns.pm --- SpamAssassin-2.21.orig/SpamAssassin/Dns.pm Tue May 7 17:32:58 2002 +++ SpamAssassin-2.21-realrbl-rbltimeout-timelog/SpamAssassin/Dns.pm Wed Jun 5 00:26:34 2002 @@ -54,31 +54,98 @@ ########################################################################### sub do_rbl_lookup { - my ($self, $set, $dom, $ip, $found) = @_; + my ($self, $set, $dom, $ip, $found, $dialupreturn, $needresult) = @_; + my $socket; + my @addr=(); + my $maxwait=$self->{conf}->{rbl_timeout}; return $found if $found; - my $q = $self->{res}->search ($dom); + my $gotdialup=0; + my $domainonly; + ($domainonly = $dom) =~ s/^\d+\.\d+\.\d+\.\d+.//; + $domainonly =~ s/\.?$/./; + + if (defined $self->{dnscache}->{rbl}->{$dom}->{result}) { + dbg("Found $dom in our DNS cache. Yeah!", "rbl", -1); + @addr = @{$self->{dnscache}->{rbl}->{$dom}->{result}}; + } elsif (not defined $self->{dnscache}->{rbl}->{$dom}->{socket}) { + dbg("Launching DNS query for $dom in the background", "rbl", -1); + $self->{dnscache}->{rbl}->{$dom}->{socket}=$self->{res}->bgsend($dom); + $self->{dnscache}->{rbl}->{$dom}->{time}=time; + return 0; + } elsif (not $needresult) { + dbg("Second batch query for $dom, ignoring since we have one pending", "rbl", -1); + return 0; + } else { + timelog("RBL -> Waiting for result on $dom", "rbl", 1); + $socket=$self->{dnscache}->{rbl}->{$dom}->{socket}; + + while (not $self->{res}->bgisready($socket)) { + last if (time - $self->{dnscache}->{rbl}->{$dom}->{time} > $maxwait); + sleep 1; + } + + if (not $self->{res}->bgisready($socket)) { + timelog("RBL -> Timeout on $dom", "rbl", 2); + dbg("Query for $dom timed out after $maxwait seconds", "rbl", -1); + return 0; + } else { + my $packet = $self->{res}->bgread($socket); + undef($socket); + foreach $_ ($packet->answer) { + dbg("Query for $dom yielded: ".$_->rdatastr, "rbl", -2); + if ($_->type eq "A") { + push(@addr, $_->rdatastr); + } + } + $self->{dnscache}->{rbl}->{$dom}->{result} = \@addr; + } + } + + if (@addr) { + foreach my $addr (@addr) { + + # 127.0.0.2 is the traditional boolean indicator, don't log it + # 127.0.0.3 now also means "is a dialup IP" (only if set is dialup + # -- Marc) + if ($addr ne '127.0.0.2' and + not ($addr eq '127.0.0.3' and $set =~ /^dialup/)) { + $self->test_log ("RBL check: found ".$dom.", type: ".$addr); + } else { + $self->test_log ("RBL check: found ".$dom); + } + dbg("RBL check: found $dom, type: $addr", "rbl", -2); + + $self->{$set}->{rbl_IN_As_found} .= $addr.' '; + $self->{$set}->{rbl_matches_found} .= $ip.' '; - if ($q) { - foreach my $rr ($q->answer) { - if ($rr->type eq "A") { - my $addr = $rr->address(); - dbg ("record found for $dom = $addr"); - - if ($addr ne '127.0.0.2' && $addr ne '127.0.0.3') { - $self->test_log ("RBL check: found ".$dom.", type: ".$addr); - } else { - # 127.0.0.2 is the traditional boolean indicator, don't log it - # 127.0.0.3 now also means "is a dialup IP" - $self->test_log ("RBL check: found ".$dom); + # If $dialupreturn is a reference to a hash, we were told to ignore + # dialup IPs, let's see if we have a match + if ($dialupreturn) { + my $toign; + dbg("Checking dialup_codes for $addr as a DUL code for $domainonly", "rbl", -2); + + foreach $toign (keys %{$dialupreturn}) { + dbg("Comparing against $toign/".$dialupreturn->{$toign}, "rbl", -3); + $toign =~ s/\.?$/./; + if ($domainonly eq $toign and $addr eq $dialupreturn->{$toign}) { + dbg("Got $addr in $toign for $ip, good, we'll take it", "rbl", "-3"); + $gotdialup=1; + last; + } } - $self->{$set}->{rbl_IN_As_found} .= $addr.' '; - $self->{$set}->{rbl_matches_found} .= $ip.' '; - return ($found+1); + if (not $gotdialup) { + dbg("Ignoring return $addr for $ip, not known as dialup for $domainonly in dialup_code variable", "rbl", -2); + next; + } } + + timelog("RBL -> match on $dom", "rbl", 2); + return 1; } } + timelog("RBL -> No match on $dom", "rbl", 2); return 0; } @@ -130,7 +197,7 @@ my ($self) = @_; if ($self->{main}->{local_tests_only}) { - dbg ("local tests only, ignoring Razor"); + dbg ("local tests only, ignoring Razor", "razor", -1); return 0; } @@ -139,30 +206,32 @@ }; if ($@) { - dbg ("Razor is not available"); + dbg ("Razor is not available", "razor", -1); return 0; } else { - dbg ("Razor is available"); + dbg ("Razor is available", "razor", -1); return 1; } } sub razor_lookup { my ($self, $fulltext) = @_; + my $timeout=$self->{conf}->{razor_timeout}; if ($self->{main}->{local_tests_only}) { - dbg ("local tests only, ignoring Razor"); + dbg ("local tests only, ignoring Razor", "razor", -1); return 0; } + timelog("Razor -> Starting razor test ($timeout secs max)", "razor", 1); + my @msg = split (/^/m, $$fulltext); - my $timeout = 10; # seconds my $response = undef; my $config = $self->{conf}->{razor_config}; my %options = ( - 'debug' => $Mail::SpamAssassin::DEBUG + 'debug' => ($Mail::SpamAssassin::DEBUG->{enabled} and $Mail::SpamAssassin::DEBUG->{razor} < -2) ); # razor also debugs to stdout. argh. fix it to stderr... @@ -179,7 +248,7 @@ local ($^W) = 0; # argh, warnings in Razor local $SIG{ALRM} = sub { die "alarm\n" }; - alarm 10; + alarm $timeout; my $rc = Razor::Client->new ($config, %options); @@ -207,7 +276,8 @@ if ($@) { $response = undef; if ($@ =~ /alarm/) { - dbg ("razor check timed out after $timeout secs."); + dbg ("razor check timed out after $timeout secs.", "razor", -1); + timelog("Razor -> interrupted after $timeout secs", "razor", 2); } else { warn ("razor check skipped: $! $@"); } @@ -221,7 +291,11 @@ close OLDOUT; } - if ((defined $response) && ($response+0)) { return 1; } + if ((defined $response) && ($response+0)) { + timelog("Razor -> Finished razor test: confirmed spam", "razor", 2); + return 1; + } + timelog("Razor -> Finished razor test: not known spam", "razor", 2); return 0; } @@ -253,6 +327,7 @@ my %count; my $left; my $right; + my $timeout=$self->{conf}->{dcc_timeout}; $count{body} = 0; $count{fuz1} = 0; @@ -263,13 +338,15 @@ return 0; } + timelog("DCC -> Starting test ($timeout secs max)", "dcc", 1); + eval { my ($dccin, $dccout, $pid); local $SIG{ALRM} = sub { die "alarm\n" }; local $SIG{PIPE} = sub { die "brokenpipe\n" }; - alarm(10); + alarm($timeout); $dccin = gensym(); $dccout = gensym(); @@ -293,18 +370,22 @@ $response = undef; if ($@ =~ /alarm/) { dbg ("DCC check timed out after 10 secs."); + timelog("DCC -> interrupted after $timeout secs", "dcc", 2); return 0; } elsif ($@ =~ /brokenpipe/) { - dbg ("DCC check failed - Broken pipe."); + dbg ("DCC -> check failed - Broken pipe."); + timelog("dcc check failed, broken pipe", "dcc", 2); return 0; } else { - warn ("DCC check skipped: $! $@"); + warn ("DCC -> check skipped: $! $@"); + timelog("dcc check skipped", "dcc", 2); return 0; } } if ($response !~ /^X-DCC/) { - dbg ("DCC check failed - no X-DCC returned (did you create a map file?): $response"); + dbg ("DCC -> check failed - no X-DCC returned (did you create a map file?): $response"); + timelog("dcc check failed", "dcc", 2); return 0; } @@ -331,9 +412,11 @@ if ($count{body} >= $self->{conf}->{dcc_body_max} || $count{fuz1} >= $self->{conf}->{dcc_fuz1_max} || $count{fuz2} >= $self->{conf}->{dcc_fuz2_max}) { dbg ("DCC: Listed! BODY: $count{body} >= $self->{conf}->{dcc_body_max} FUZ1: $count{fuz1} >= $self->{conf}->{dcc_fuz1_max} FUZ2: $count{fuz2} >= $self->{conf}->{dcc_fuz2_max}"); + timelog("DCC -> got hit", "dcc", 2); return 1; } + timelog("DCC -> no match", "dcc", 2); return 0; } diff -urN SpamAssassin-2.21.orig/SpamAssassin/EvalTests.pm SpamAssassin-2.21-realrbl-rbltimeout-timelog/SpamAssassin/EvalTests.pm --- SpamAssassin-2.21.orig/SpamAssassin/EvalTests.pm Thu May 23 17:30:00 2002 +++ SpamAssassin-2.21-realrbl-rbltimeout-timelog/SpamAssassin/EvalTests.pm Mon Jun 3 22:48:55 2002 @@ -556,9 +556,12 @@ ########################################################################### sub check_rbl { - my ($self, $set, $rbl_domain) = @_; + my ($self, $set, $rbl_domain, $needresult) = @_; local ($_); - dbg ("checking RBL $rbl_domain, set $set"); + # How many IPs max you check in the received lines; + my $checklast=$self->{conf}->{num_check_received} - 1; + + dbg ("checking RBL $rbl_domain, set $set", "rbl", -1); my $rcv = $self->get ('Received'); my @ips = ($rcv =~ /\[(\d+\.\d+\.\d+\.\d+)\]/g); @@ -569,9 +572,11 @@ return 0 unless $self->is_dns_available(); $self->load_resolver(); + dbg("Got the following IPs: ".join(", ", @ips), "rbl", -3); if ($#ips > 1) { - @ips = @ips[$#ips-1 .. $#ips]; # only check the originating 2 + @ips = @ips[$#ips-$checklast .. $#ips]; # only check the originating IPs } + dbg("But only inspecting the following IPs: ".join(", ", @ips), "rbl", -3); if (!defined $self->{$set}->{rbl_IN_As_found}) { $self->{$set}->{rbl_IN_As_found} = ' '; @@ -582,17 +587,54 @@ my $already_matched_in_other_zones = ' '.$self->{$set}->{rbl_matches_found}.' '; my $found = 0; - # First check that DNS is available, if not do not perform this check. + # First check that DNS is available. If not, do not perform this check. # Stop after the first positive. eval { + my $i=0; + my ($b1,$b2,$b3,$b4); + my $dialupreturn; foreach my $ip (@ips) { + $i++; next if ($ip =~ /${IP_IN_RESERVED_RANGE}/o); - next if ($already_matched_in_other_zones =~ / ${ip} /); + # Some of the matches in other zones, like a DUL match on a first hop + # may be negated by another rule, so preventing a match in two zones + # is better done with a Z_FUDGE_foo rule that users check_both_rbl_results + # and sets a negative score to compensate + # It's also useful to be able to flag mail that went through an IP that + # is on two different blacklists -- Marc + #next if ($already_matched_in_other_zones =~ / ${ip} /); + if ($already_matched_in_other_zones =~ / ${ip} /) { + dbg("Skipping $ip, already matched in other zones for $set", "rbl", -1); + next; + } next unless ($ip =~ /(\d+)\.(\d+)\.(\d+)\.(\d+)/); - $found = $self->do_rbl_lookup ($set, "$4.$3.$2.$1.".$rbl_domain, $ip, $found); + ($b1, $b2, $b3, $b4) = ($1, $2, $3, $4); + + # By default, we accept any return on an RBL + undef $dialupreturn; + + # foo-firsthop are special rule names that only match on the + # first Received line (used to give a negative score to counter the + # normal dialup rule and not penalize people who relayed through their + # ISP) -- Marc + # By default this rule won't get run unless it's the first hop IP + if ($set =~ /-firsthop$/) { + if ($#ips>0 and $i == $#ips + 1) { + dbg("Set dialupreturn on $ip for first hop", "rbl", -2); + $dialupreturn=$self->{conf}->{dialup_codes}; + die "$self->{conf}->{dialup_codes} undef" if (!defined $dialupreturn); + } else { + dbg("Not running firsthop rule against middle hop or direct dialup IP connection (ip $ip)", "rbl", -2); + next; + } + } + + $found = $self->do_rbl_lookup ($set, "$b4.$b3.$b2.$b1.".$rbl_domain, $ip, $found, $dialupreturn, $needresult); + dbg("Got $found on $ip (item $i)", "rbl", -3); } }; + dbg("Check_rbl returning $found", "rbl", -3); $found; } @@ -601,7 +643,7 @@ sub check_rbl_results_for { my ($self, $set, $addr) = @_; - dbg ("checking RBL results in set $set for $addr"); + dbg ("checking RBL results in set $set for $addr", "rbl", -1); return 0 if $self->{conf}->{skip_rbl_checks}; return 0 unless $self->is_dns_available(); return 0 unless defined ($self->{$set}); @@ -615,6 +657,26 @@ ########################################################################### +sub check_two_rbl_results { + my ($self, $set1, $addr1, $set2, $addr2) = @_; + + return 0 if $self->{conf}->{skip_rbl_checks}; + return 0 unless $self->is_dns_available(); + return 0 unless defined ($self->{$set1}); + return 0 unless defined ($self->{$set2}); + return 0 unless defined ($self->{$set1}->{rbl_IN_As_found}); + return 0 unless defined ($self->{$set2}->{rbl_IN_As_found}); + + my $inas1 = ' '.$self->{$set1}->{rbl_IN_As_found}.' '; + my $inas2 = ' '.$self->{$set2}->{rbl_IN_As_found}.' '; + if ($inas1 =~ / ${addr1} / and $inas2 =~ / ${addr2} /) { return 1; } + + return 0; +} + + +########################################################################### + sub check_for_unique_subject_id { my ($self) = @_; local ($_); @@ -1041,7 +1103,7 @@ my @rcvddatestrs = ($rcvd =~ /\s.?\d+ \S\S\S \d+ \d+:\d+:\d+ \S+/g); my @rcvddates = (); foreach $rcvd (@rcvddatestrs) { - dbg ("trying Received header date for real time: $rcvd"); + dbg ("trying Received header date for real time: $rcvd", "datediff", -2); $rcvd = $self->_parse_rfc822_date ($rcvd); if ($rcvd) { push (@rcvddates, $rcvd); @@ -1049,7 +1111,7 @@ } if ($#rcvddates <= 0) { - dbg ("no dates found in Received headers, not raising flag"); + dbg ("no dates found in Received headers, not raising flag", "datediff", -1); return 0; } @@ -1057,7 +1119,7 @@ foreach $rcvd (@rcvddates) { my $diff = $time - $rcvd; - dbg ("time_t from date=$time, rcvd=$rcvd, diff=$diff"); + dbg ("time_t from date=$time, rcvd=$rcvd, diff=$diff", "datediff", -2); push(@diffs, $diff); } diff -urN SpamAssassin-2.21.orig/SpamAssassin/PerMsgStatus.pm SpamAssassin-2.21-realrbl-rbltimeout-timelog/SpamAssassin/PerMsgStatus.pm --- SpamAssassin-2.21.orig/SpamAssassin/PerMsgStatus.pm Thu May 23 17:30:00 2002 +++ SpamAssassin-2.21-realrbl-rbltimeout-timelog/SpamAssassin/PerMsgStatus.pm Wed Jun 5 00:15:56 2002 @@ -83,8 +82,20 @@ $self->remove_unwanted_headers(); { + # If you run timelog from within specified rules, prefix the message with + # "Rulename -> " so that it's easy to pick out details from the overview + # -- Marc + timelog("Launching RBL queries in the background", "rblbg", 1); + # Here, we launch all the DNS RBL queries and let them run while we + # inspect the message -- Marc + $self->do_rbl_eval_tests(0); + timelog("Finished launching RBL queries in the background", "rblbg", 22); + + timelog("Starting head tests", "headtest", 1); $self->do_head_tests(); + timelog("Finished head tests", "headtest", 2); + timelog("Starting body tests", "bodytest", 1); # do body tests with decoded portions { my $decoded = $self->get_decoded_stripped_body_text_array(); @@ -94,7 +105,9 @@ $self->do_body_uri_tests($decoded); undef $decoded; } + timelog("Finished body tests", "bodytest", 2); + timelog("Starting raw body tests", "rawbodytest", 1); # do rawbody tests with raw text portions { my $bodytext = $self->get_decoded_body_text_array(); @@ -102,7 +115,9 @@ $self->do_rawbody_eval_tests($bodytext); undef $bodytext; } + timelog("Finished raw body tests", "rawbodytest", 2); + timelog("Starting full message tests", "fullmsgtest", 1); # and do full tests: first with entire, full, undecoded message # still skip application/image attachments though { @@ -112,8 +127,18 @@ $self->do_full_eval_tests(\$fulltext); undef $fulltext; } + timelog("Finished full message tests", "fullmsgtest", 2); + timelog("Starting head eval tests", "headevaltest", 1); $self->do_head_eval_tests(); + timelog("Finished head eval tests", "headevaltest", 2); + + timelog("Starting RBL tests (will wait up to $self->{conf}->{dns_timeout} secs before giving up)", "rblblock", 1); + # This time we want to harvest the DNS results -- Marc + $self->do_rbl_eval_tests(1); + # And now we can compute rules that depend on those results + $self->do_rbl_res_eval_tests(); + timelog("Finished all RBL tests", "rblblock", 2); # Do AWL tests last, since these need the score to have already been calculated $self->do_awl_tests(); @@ -1002,7 +1027,6 @@ $hdrname =~ s/#/[HASH]/g; # avoid probs with eval below $def =~ s/#/[HASH]/g; - # dbg ("header regexp test '.$rulename.'"); $evalstr .= ' return if ('.$score.' > 0) && $self->{stop_at_threshold} && $self->is_spam(); if ($self->{conf}->{scores}->{q#'.$rulename.'#}) { @@ -1015,7 +1039,10 @@ $_ = shift; if ($self->get(q#'.$hdrname.'#, q#'.$def.'#) '.$testtype.'~ '.$pat.') { $self->got_hit (q#'.$rulename.'#, q{}); - } + dbg("Ran header regex rule '.$rulename.' ======> got hit", "rulesrun", 1); + } else { + dbg("Ran header regex rule '.$rulename.' but did not get hit", "rulesrun", 1); + } } '; } @@ -1091,7 +1118,12 @@ sub '.$rulename.'_body_test { my $self = shift; $_ = shift; - if ('.$pat.') { $self->got_body_pattern_hit (q{'.$rulename.'}); } + if ('.$pat.') { + $self->got_body_pattern_hit (q{'.$rulename.'}); + dbg("Ran body-text regex rule '.$rulename.' ======> got hit", "rulesrun", 2); + } else { + dbg("Ran body-text regex rule '.$rulename.' but did not get hit", "rulesrun", 2); + } } '; } @@ -1278,7 +1310,12 @@ sub '.$rulename.'_uri_test { my $self = shift; $_ = shift; - if ('.$pat.') { $self->got_uri_pattern_hit (q{'.$rulename.'}); } + if ('.$pat.') { + $self->got_uri_pattern_hit (q{'.$rulename.'}); + dbg("Ran uri test rule '.$rulename.' ======> got hit", "rulesrun", 4); + } else { + dbg("Ran uri test rule '.$rulename.' but did not get hit", "rulesrun", 4); + } } '; } @@ -1358,7 +1395,12 @@ sub '.$rulename.'_rawbody_test { my $self = shift; $_ = shift; - if ('.$pat.') { $self->got_body_pattern_hit (q{'.$rulename.'}); } + if ('.$pat.') { + $self->got_body_pattern_hit (q{'.$rulename.'}); + dbg("Ran body_pattern_hit rule '.$rulename.' ======> got hit", "rulesrun", 8); + } else { + dbg("Ran body_pattern_hit rule '.$rulename.' but did not get hit", "rulesrun", 8); + } } '; } @@ -1432,6 +1475,9 @@ if ($self->{conf}->{scores}->{q{'.$rulename.'}}) { if ($$fullmsgref =~ '.$pat.') { $self->got_body_pattern_hit (q{'.$rulename.'}); + dbg("Ran full-text regex rule '.$rulename.' =====> got hit", "rulesrun", 16); + } else { + dbg("Ran full-text regex rule '.$rulename.' but did not get hit", "rulesrun", 16); } } '; @@ -1463,6 +1509,16 @@ ########################################################################### +sub do_rbl_eval_tests { + my ($self, $needresult) = @_; + $self->run_rbl_eval_tests ($self->{conf}->{rbl_evals}, $needresult); +} + +sub do_rbl_res_eval_tests { + my ($self) = @_; + $self->run_rbl_eval_tests ($self->{conf}->{rbl_res_evals}); +} + sub do_head_eval_tests { my ($self) = @_; $self->run_eval_tests ($self->{conf}->{head_evals}, ''); @@ -1566,6 +1622,13 @@ eval { $result = $self->$evalsub(@args); }; + + if ($result) { + dbg("Ran run_eval_test rule $rulename ======> got hit", "rulesrun", 32); + } else { + dbg("Ran run_eval_test rule $rulename but did not get hit", "rulesrun", 32); + } + if ($@) { warn "Failed to run $rulename SpamAssassin test, skipping:\n". "\t($@)\n"; @@ -1578,6 +1641,52 @@ ########################################################################### +sub run_rbl_eval_tests { + my ($self, $evalhash, $needresult) = @_; + my ($rulename, $pat, @args); + local ($_); + + my @tests = keys %{$evalhash}; + + foreach my $rulename (sort (@tests)) { + next unless ($self->{conf}->{scores}->{$rulename}); + my $score = $self->{conf}{scores}{$rulename}; + return if ($score > 0) && $self->{stop_at_threshold} && $self->is_spam(); + my $evalsub = $evalhash->{$rulename}; + + my $result; + $self->clear_test_state(); + + @args = (); + $evalsub =~ s/\s*\((.*?)\)\s*$//; + if (defined $1 && $1 ne '') { push (@args, mk_param($1)); } + + eval { + $result = $self->$evalsub(@args, $needresult); + }; + + # A run with $job eq 0 is just to start DNS queries + if ($needresult eq 1) + { + if ($result) { + dbg("Ran run_rbl_eval_test rule $rulename ======> got hit", "rulesrun", 64); + } else { + dbg("Ran run_rbl_eval_test rule $rulename but did not get hit", "rulesrun", 64); + } + + if ($@) { + warn "Failed to run $rulename RBL SpamAssassin test, skipping:\n". + "\t($@)\n"; + next; + } + + if ($result) { $self->got_hit ($rulename, "RBL: "); } + } + } +} + +########################################################################### + sub got_body_pattern_hit { my ($self, $rulename) = @_; @@ -1755,6 +1864,7 @@ } sub dbg { Mail::SpamAssassin::dbg (@_); } +sub timelog { Mail::SpamAssassin::timelog (@_); } sub sa_die { Mail::SpamAssassin::sa_die (@_); } ########################################################################### diff -urN SpamAssassin-2.21.orig/SpamAssassin.pm SpamAssassin-2.21-realrbl-rbltimeout-timelog/SpamAssassin.pm --- SpamAssassin-2.21.orig/SpamAssassin.pm Tue May 14 17:34:47 2002 +++ SpamAssassin-2.21-realrbl-rbltimeout-timelog/SpamAssassin.pm Wed Jun 5 00:31:31 2002 @@ -59,13 +59,22 @@ use Cwd; use Config; +# Let's not make this required -- Marc +#eval { require Time::HiRes }; +#Time::HiRes->import( qw(time) ) unless $@; +# Unfortunately, the above doesn't work, please FIXME +use Time::HiRes qw ( time ); + use vars qw{ - @ISA $VERSION $SUB_VERSION $HOME_URL $DEBUG + @ISA $VERSION $SUB_VERSION $HOME_URL $DEBUG $TIMELOG @default_rules_path @default_prefs_path @default_userprefs_path @default_userstate_dir @site_rules_path @old_site_rules_path }; +# Create the hash so that it really points to something, otherwise we can't +# get a reference to it -- Marc +$TIMELOG->{dummy}=0; @ISA = qw(); $VERSION = "2.21"; @@ -75,8 +84,6 @@ $HOME_URL = "http://spamassassin.org/"; -$DEBUG = 0; - #__installsitelib__/spamassassin.cf #__installvendorlib__/spamassassin.cf @default_rules_path = qw( @@ -172,7 +179,26 @@ if (!defined $self) { $self = { }; } bless ($self, $class); - if (defined $self->{debug}) { $DEBUG = $self->{debug}+0; } + $DEBUG->{enabled} = 0; + if (defined $self->{debug} and $self->{debug}) { $DEBUG->{enabled} = 1 } + + # This should be moved elsewhere, I know, but SA really needs debug sets + # I'm putting the intialization here for now, move it if you want + + # For each part of the code, you can set debug levels. If the level is + # progressive, use negative numbers (the more negative, the move debug info + # is put out), and if you want to use bit fields, use positive numbers + # All code path debug codes should be listed here with a value of 0 if you + # want them disabled -- Marc + + $DEBUG->{datediff}=-1; + $DEBUG->{razor}=-3; + $DEBUG->{rbl}=0; + $DEBUG->{timelog}=0; + # Bitfield: + # header regex: 1 | body-text: 2 | uri tests: 4 | raw-body-text: 8 + # full-text regexp: 16 | run_eval_tests: 32 | run_rbl_eval_tests: 64 + $DEBUG->{rulesrun}=64; $self->{conf} ||= new Mail::SpamAssassin::Conf ($self); $self; @@ -199,10 +225,17 @@ my ($self, $mail_obj) = @_; local ($_); + timelog("Starting SpamAssassin Check", "SAfull", 1); $self->init(1); + timelog("Init completed"); my $mail = $self->encapsulate_mail_object ($mail_obj); my $msg = Mail::SpamAssassin::PerMsgStatus->new($self, $mail); + chomp($TIMELOG->{mesgid} = $mail_obj->get("Message-Id")); + $TIMELOG->{mesgid} =~ s#<(.*)>#$1#; + timelog("Created message object, checking message", "msgcheck", 1); $msg->check(); + timelog("Done checking message", "msgcheck", 2); + timelog("Done running SpamAssassin", "SAfull", 2); $msg; } @@ -814,8 +847,82 @@ return @addrs; } +# First argument is the message you want to log for that time +# wheredelta is 1 for starting a split on the stopwatch, and 2 for showing the +# instant delta (used to show how long a specific routine took to run) +# deltaslot says which stopwatch you are working with (needs to match for begin +# and end obviously) +sub timelog { + my ($msg, $deltaslot, $wheredelta) = @_; + my $now=time; + my $tl=$Mail::SpamAssassin::TIMELOG; + my $dbg=$Mail::SpamAssassin::DEBUG; + + if ($deltaslot eq "SAfull" and $wheredelta eq 1) { + $tl->{'start'}=$now; + } + + if (defined $wheredelta) { + $tl->{stopwatch}->{$deltaslot}=$now if ($wheredelta eq 1); + if ($wheredelta eq 2) { + if (not defined $tl->{stopwatch}->{$deltaslot}) { + warn("Error: got end of time log for $deltaslot but never got the start\n"); + } else { + $msg.=sprintf(" (Delta: %.3fs)", + $now - $tl->{stopwatch}->{$deltaslot} ); + } + } + } + + $msg=sprintf("%.3f: $msg\n", $now - $tl->{start}); + + if (not ($tl->{logpath} and $tl->{mesgid})) { + push (@{$tl->{keeplogs}}, $msg); + print $msg if ($dbg->{timelog}); + dbg("Log not yet opened, continuing", "timelog", -2); + return; + } + if (not $tl->{flushedlogs} and $tl->{logpath} and $tl->{mesgid}) { + my $file="$tl->{logpath}/".sprintf("%.4f",time)."_$tl->{mesgid}"; + + $tl->{flushedlogs}=1; + dbg("Flushing logs to $file", "timelog", -2); + open (LOG, ">>$file") or warn("Can't open file: $!"); + + while (defined ($_ = shift(@{$tl->{keeplogs}}))) + { + print LOG $_; + } + dbg("Done flushing logs", "timelog", -2); + } + print LOG $msg; + print $msg if ($dbg->{timelog}); +} + + +# Only the first argument is needed, and it can be a reference to a list if +# you want sub dbg { - if ($Mail::SpamAssassin::DEBUG > 0) { warn "debug: ".join('',@_)."\n"; } + my ($msg, $codepath, $level) = @_; + my $dbg=$Mail::SpamAssassin::DEBUG; + + $msg=join('',@{$msg}) if (ref $msg); + + if (defined $codepath) { + if (not defined $dbg->{$codepath}) { + warn("dbg called with codepath $codepath, but it's not defined, skipping (message was \"$msg\"\n"); + return 0; + } elsif (not defined $level) { + warn("dbg called with codepath $codepath, but no level threshold (message was \"$msg\"\n"); + } + } + return if (not $dbg->{enabled}); + # Negative levels are just level numbers, the more negative, the more debug + return if (defined $level and $level<0 and not $dbg->{$codepath} <= $level); + # Positive levels are bit fields + return if (defined $level and $level>0 and not $dbg->{$codepath} & $level); + + warn "debug: $msg\n"; } # sa_die -- used to die with a useful exit code.