Add per-rule "tmethod" test methods (line, linerange, string, stringtrim)
for body and rawbody rules, the fast_body / fast_rawbody shortcuts, and the
tmethod_max_* limits.

PerMsgStatus.pm: the generated <rule>_body_test and <rule>_rawbody_test subs
look up the tmethod of the rule in $self->{conf}->{tmethod}. They either test
the pattern line by line (optionally restricted to one line number or to a
range of lines), or test it once against a single string built from the
leading lines and capped at tmethod_max_bytes / tmethod_max_bytes_raw.
Conf.pm: new settings tmethod_max_lines, tmethod_max_lines_raw,
tmethod_max_bytes, tmethod_max_bytes_raw, fast_body, fast_rawbody, tmethod.

Review fixes folded into this revision (hunk line counts are unchanged):
  1. line/linerange: the line counter was incremented after the "next"
     filters, so it never advanced past a skipped line; "line N" with N > 1
     and any range not containing line 1 could never match. The counter is
     now incremented at the top of the loop.
  2. linerange: "0:5" now covers the first 5 lines, and a negative start or
     a stop of 0 or less is counted from the end, as the POD describes.
  3. string/stringtrim: the buffer is initialised, and a start position past
     the end of the buffer yields an empty string instead of undef plus a
     "substr outside of string" warning.
  4. The tmethod lookup quotes the rule name with q{}, like the tflags lookup.
  5. Conf.pm "tmethod" rejects unknown method types at parse time.
  6. POD: documented default of tmethod_max_lines_raw matches the code (256),
     examples use the start:bytes syntax the code parses, typos fixed.

NOTE(review): leading whitespace was lost before this patch reached review
and has been re-created here; apply with "patch -l" or re-diff. The contents
of the C<...> codes in the POD were also stripped and have been restored from
context; please confirm them against the author's copy.
NOTE(review): the code inside $evalstr2 lives in a single-quoted string, so
comments added there must not contain an apostrophe.

Index: PerMsgStatus.pm
===================================================================
--- PerMsgStatus.pm	(revision 345325)
+++ PerMsgStatus.pm	(working copy)
@@ -50,6 +50,13 @@
 use warnings;
 use Carp;
 
+# was used to performance-test rules using the various tmethods..
+# BEGIN {
+#   eval { require Time::HiRes };
+#   Time::HiRes->import( qw(gettimeofday) ) unless $@;
+#   Time::HiRes->import( qw(tv_interval) ) unless $@;
+# }
+
 use Mail::SpamAssassin::Constants qw(:sa);
 use Mail::SpamAssassin::EvalTests;
 use Mail::SpamAssassin::Conf;
@@ -1772,7 +1779,7 @@
   my ($self, $priority, $textary) = @_;
   local ($_);
 
-  dbg("rules: running body-text per-line regexp tests; score so far=".$self->{score});
+  dbg("rules: running body-text regexp tests; score so far=".$self->{score});
 
   my $doing_user_rules = 
     $self->{conf}->{user_rules_to_compile}->{$Mail::SpamAssassin::Conf::TYPE_BODY_TESTS};
@@ -1810,15 +1817,113 @@
     $evalstr2 .= '
       sub '.$rulename.'_body_test {
            my $self = shift;
-           foreach (@_) {
-             '.$self->hash_line_for_rule($rulename).'
-             if ('.$pat.') {
-                $self->got_pattern_hit(q{'.$rulename.'}, "BODY: ");
-                '. $self->hit_rule_plugin_code($rulename, "body") . '
-                # Ok, we hit, stop now.
-                last unless $self->{conf}->{tflags}->{q{'.$rulename.'}} =~ /\bmultiple\b/;
-             }
+
+           # currently using Time::HiRes to do performance testing on
+           # individual rules...
+           # my $start_time = [gettimeofday];
+
+           my $tmethod = $self->{conf}->{tmethod}->{q{'.$rulename.'}}->{type} || "line";
+           my $args = $self->{conf}->{tmethod}->{q{'.$rulename.'}}->{args} || "";
+
+           # tmethod: line and linerange
+           if ($tmethod =~ m/^line/) {
+
+             my $linecount=0;
+             my $maxlines = '.$self->{conf}->{tmethod_max_lines}.';
+
+             foreach (@_) {
+               $linecount++;   # 1-based; must advance before any "next" below
+               # support predefined line number scanning
+               if ($tmethod eq "line" && $args =~ m/^\d+$/) {
+                 next unless ($linecount == $args);
+               }
+
+               # support line range scanning (lines start+1 .. stop)
+               # do not allow the start/stop delta to exceed the
+               # config option tmethod_max_lines
+               elsif ($tmethod eq "linerange" && $args =~ m/(\-?\d+):(\-?\d+)/) {
+                 my ($start,$stop);
+                 $start = ($1 < 0)  ? $1 + scalar(@_) : $1;   # negative: counted from the end
+                 $stop  = ($2 <= 0) ? $2 + scalar(@_) : $2;   # 0 or negative: counted from the end
+                 if ($stop - $start > $maxlines) {
+                   $stop = $start + $maxlines;
+                 }
+                 next unless ($linecount > $start && $linecount <= $stop);
+               }
+
+               # this line passed the line filters; run the rule on it
+               '.$self->hash_line_for_rule($rulename).'
+               if ('.$pat.') {
+                 $self->got_pattern_hit(q{'.$rulename.'}, "BODY: ");
+                 '. $self->hit_rule_plugin_code($rulename, "body") . '
+                 # Ok, we hit, stop now.
+                 last unless $self->{conf}->{tflags}->{q{'.$rulename.'}} =~ /\bmultiple\b/;
+               }
+             }
            }
+
+           elsif ($tmethod =~ m/^string/) {
+
+             my ($fastbody,$start_pos,$bytes) = ("");
+             my $maxbytes = '.$self->{conf}->{tmethod_max_bytes}.';
+
+             # if args are passed to method string, we will set start
+             # position and number of bytes which will feed into
+             # a substr call later. if start and bytes are not defined
+             # we manually set start=0 and bytes=tmethod_max_bytes
+             if ($args && $args =~ m/(\-?\d+):(\-?\d+)/) {
+               $start_pos = $1;
+               $bytes = $2;
+             }
+             else {
+               $start_pos = 0;
+               $bytes = $maxbytes;
+             }
+
+             # append all content lines to fastbody scalar
+             # until the size of the scalar exceeds tmethod_max_bytes
+             foreach (@_) {
+               $fastbody .= $_;
+               if (length $fastbody > $maxbytes) {
+                 last;
+               }
+             }
+
+             # substr the content down; a start past the end gives an empty string
+             $fastbody = ($start_pos <= length($fastbody)) ? substr($fastbody,$start_pos,$bytes) : "";
+
+             # if the length of fastbody exceeds the config tmethod_max_bytes
+             # we need to substr it down further to avoid expensive regexp tests
+             # ie, string 0:512000 could be expensive depending on startpos
+             my $fblen = length $fastbody;
+             if ($fblen > $maxbytes) {
+               $fastbody = substr($fastbody,0,$maxbytes);
+             }
+
+             # if the tmethod is stringtrim, we need to convert
+             # newlines to space, and then convert excess whitespace
+             # to a single space. this is most beneficial in rawbody
+             # ruletypes as there is no efficient way currently to get
+             # html into a single trimmed string.
+             if ($tmethod eq "stringtrim") {
+               $fastbody =~ s/[\n\r]+/ /gs;
+               $fastbody =~ s/\s+/ /g;
+             }
+
+             '.$self->hash_line_for_rule($rulename).'
+             if ($fastbody && $fastbody =~ '.$pat.') {
+               $self->got_pattern_hit(q{'.$rulename.'}, "BODY: ");
+               '. $self->hit_rule_plugin_code($rulename, "body") . '
+             }
+             # dbg("rules: '.$rulename.' - start position $start_pos, bytes $bytes, total length $fblen");
+           }
+           else {
+             warn("rules: invalid method type defined for rule '.$rulename.'");
+           }
+
+           # my $elapsed_time = tv_interval ($start_time, [gettimeofday]);
+           # dbg("rules: '.$rulename.' took $elapsed_time seconds, using method=[$tmethod] args=[$args]");
+
       }
     ';
     }
@@ -2312,18 +2417,115 @@
 
     $evalstr2 .= '
       sub '.$rulename.'_rawbody_test {
-         my $self = shift;
-         foreach (@_) {
-           '.$self->hash_line_for_rule($rulename).'
-           if ('.$pat.') {
-              $self->got_pattern_hit(q{'.$rulename.'}, "RAW: ");
-              '. $self->hit_rule_plugin_code($rulename, "rawbody") . '
-              # Ok, we hit, stop now.
-              last unless $self->{conf}->{tflags}->{q{'.$rulename.'}} =~ /\bmultiple\b/;
-           }
-         }
-      }
-    ';
+           my $self = shift;
+
+           # currently using Time::HiRes to do performance testing on
+           # individual rules...
+           # my $start_time = [gettimeofday];
+
+           my $tmethod = $self->{conf}->{tmethod}->{q{'.$rulename.'}}->{type} || "line";
+           my $args = $self->{conf}->{tmethod}->{q{'.$rulename.'}}->{args} || "";
+
+           # tmethod: line and linerange
+           if ($tmethod =~ m/^line/) {
+
+             my $linecount=0;
+             my $maxlines = '.$self->{conf}->{tmethod_max_lines_raw}.';
+
+             foreach (@_) {
+               $linecount++;   # 1-based; must advance before any "next" below
+               # support predefined line number scanning
+               if ($tmethod eq "line" && $args =~ m/^\d+$/) {
+                 next unless ($linecount == $args);
+               }
+
+               # support line range scanning (lines start+1 .. stop)
+               # do not allow the start/stop delta to exceed the
+               # config option tmethod_max_lines_raw
+               elsif ($tmethod eq "linerange" && $args =~ m/(\-?\d+):(\-?\d+)/) {
+                 my ($start,$stop);
+                 $start = ($1 < 0)  ? $1 + scalar(@_) : $1;   # negative: counted from the end
+                 $stop  = ($2 <= 0) ? $2 + scalar(@_) : $2;   # 0 or negative: counted from the end
+                 if ($stop - $start > $maxlines) {
+                   $stop = $start + $maxlines;
+                 }
+                 next unless ($linecount > $start && $linecount <= $stop);
+               }
+
+               # this line passed the line filters; run the rule on it
+               '.$self->hash_line_for_rule($rulename).'
+               if ('.$pat.') {
+                 $self->got_pattern_hit(q{'.$rulename.'}, "RAW: ");
+                 '. $self->hit_rule_plugin_code($rulename, "rawbody") . '
+                 # Ok, we hit, stop now.
+                 last unless $self->{conf}->{tflags}->{q{'.$rulename.'}} =~ /\bmultiple\b/;
+               }
+             }
+           }
+
+           elsif ($tmethod =~ m/^string/) {
+
+             my ($fastbody,$start_pos,$bytes) = ("");
+             my $maxbytes = '.$self->{conf}->{tmethod_max_bytes_raw}.';
+
+             # if args are passed to method string, we will set start
+             # position and number of bytes which will feed into
+             # a substr call later. if start and bytes are not defined
+             # we manually set start=0 and bytes=tmethod_max_bytes_raw
+             if ($args && $args =~ m/(\-?\d+):(\-?\d+)/) {
+               $start_pos = $1;
+               $bytes = $2;
+             }
+             else {
+               $start_pos = 0;
+               $bytes = $maxbytes;
+             }
+
+             # append all content lines to fastbody scalar
+             # until the size of the scalar exceeds tmethod_max_bytes_raw
+             foreach (@_) {
+               $fastbody .= $_;
+               if (length $fastbody > $maxbytes) {
+                 last;
+               }
+             }
+
+             # substr the content down; a start past the end gives an empty string
+             $fastbody = ($start_pos <= length($fastbody)) ? substr($fastbody,$start_pos,$bytes) : "";
+
+             # if the length of fastbody exceeds the config tmethod_max_bytes_raw
+             # we need to substr it down further to avoid expensive regexp tests
+             # ie, string 0:512000 could be expensive depending on startpos
+             my $fblen = length $fastbody;
+             if ($fblen > $maxbytes) {
+               $fastbody = substr($fastbody,0,$maxbytes);
+             }
+
+             # if the tmethod is stringtrim, we need to convert
+             # newlines to space, and then convert excess whitespace
+             # to a single space. this is most beneficial in rawbody
+             # ruletypes as there is no efficient way currently to get
+             # html into a single trimmed string.
+             if ($tmethod eq "stringtrim") {
+               $fastbody =~ s/[\n\r]+/ /gs;
+               $fastbody =~ s/\s+/ /g;
+             }
+
+             '.$self->hash_line_for_rule($rulename).'
+             if ($fastbody && $fastbody =~ '.$pat.') {
+               $self->got_pattern_hit(q{'.$rulename.'}, "RAW: ");
+               '. $self->hit_rule_plugin_code($rulename, "rawbody") . '
+             }
+             #dbg("rules: '.$rulename.' - start position $start_pos, bytes $bytes, total length $fblen");
+           }
+           else {
+             warn("rules: invalid method type defined for rule '.$rulename.'");
+           }
+
+           # my $elapsed_time = tv_interval ($start_time, [gettimeofday]);
+           # dbg("rules: '.$rulename.' took $elapsed_time seconds, using method=[$tmethod] args=[$args]");
+      }
+    ';
     }
 
     # clear out a previous version of this fn, if already defined
Index: Conf.pm
===================================================================
--- Conf.pm	(revision 345325)
+++ Conf.pm	(working copy)
@@ -1581,6 +1581,87 @@
     code => \&Mail::SpamAssassin::Conf::Parser::set_template_clear
   });
 
+=item tmethod_max_lines_raw (Default: 256)
+
+this setting sets the maximum number of lines that should be tested
+for a C<rawbody> rule when C<tmethod> is set to C<linerange>. this
+will prevent a rule with a large start and stop delta,
+(ie C<tmethod> "linerange 1:1000" ) from becoming very inefficient
+on large messages. if you want to scan every line in a message,
+simply omit a tmethod from the rule completely. C<tmethod> C<linerange>
+should be used to improve the efficiency of a rule when you know
+the location of data. By default, the maximum amount of raw content
+lines scanned will be 256.
+
+=cut
+
+  push (@cmds, {
+    setting => 'tmethod_max_lines_raw',
+    default => 256,
+    type => $CONF_TYPE_NUMERIC
+  });
+
+=item tmethod_max_lines (Default: 16)
+
+this setting sets the maximum number of lines that should be tested
+for a C<body> rule when C<tmethod> is set to C<linerange>. this will
+prevent a body rule with a large start and stop delta,
+(ie C<tmethod> "linerange 1:1000" ) from becoming very inefficient
+on large messages. if you want to scan every line in a message,
+simply omit a tmethod from the rule completely. C<tmethod> C<linerange>
+should be used to improve the efficiency of a rule when you know
+the location of data. By default, the maximum amount of body content
+lines scanned will be 16. Realize that body rules by default have
+most of the whitespace and newlines trimmed already, so even if the
+message is 100's of lines long, it may be trimmed down to a dozen or
+fewer due to the nature of the message parser.
+
+=cut
+
+  push (@cmds, {
+    setting => 'tmethod_max_lines',
+    default => 16,
+    type => $CONF_TYPE_NUMERIC
+  });
+
+=item tmethod_max_bytes_raw (Default: 65536)
+
+this setting sets the maximum number of bytes that a regexp can be
+tested against for a rawbody rule when C<tmethod> is set to
+C<string> or C<stringtrim>. this will prevent a rule with a large
+byte start and stop delta, (ie C<tmethod> "string 0:512000" ) from
+becoming inefficient on large messages. C<tmethod> C<string>
+should only be used to improve rule efficiency when you know the
+general location of data you are testing. By default, the maximum
+amount of usable raw data is 64kb.
+
+=cut
+
+  push (@cmds, {
+    setting => 'tmethod_max_bytes_raw',
+    default => 65536,
+    type => $CONF_TYPE_NUMERIC
+  });
+
+=item tmethod_max_bytes (Default: 16384)
+
+this setting sets the maximum number of bytes that a regexp can be
+tested against for a C<body> rule when C<tmethod> is set to
+C<string> or C<stringtrim>. this will prevent a rule with a large
+value for bytes (ie C<tmethod> "string 0:512000" ) from becoming
+inefficient on large messages. C<tmethod> C<string> should only be
+used to improve rule efficiency when you know the general location
+of data you are testing. By default, the maximum amount of usable
+body is 16kb.
+
+=cut
+
+  push (@cmds, {
+    setting => 'tmethod_max_bytes',
+    default => 16384,
+    type => $CONF_TYPE_NUMERIC
+  });
+
 =back
 
 =head1 RULE DEFINITIONS AND PRIVILEGED SETTINGS
@@ -1933,6 +2014,34 @@
     }
   });
 
+=item fast_body SYMBOLIC_TEST_NAME /pattern/modifiers
+
+Same as C<body> except it predefines a stringtrim C<tmethod> of '0:8192',
+which means any fast_body rules will only apply to the first 8kb worth
+of body content max. If your rule needs access to more body content than
+8kb, use a C<body> rule and define a C<tmethod> separately.
+
+C<fast_body> evals are not supported.
+
+=cut
+
+  push (@cmds, {
+    setting => 'fast_body',
+    is_frequent => 1,
+    is_priv => 1,
+    code => sub {
+      my ($self, $key, $value, $line) = @_;
+      my @values = split(/\s+/, $value, 2);
+      if (@values != 2) {
+        return $MISSING_REQUIRED_VALUE;
+      }
+      $self->{parser}->add_test (@values, $TYPE_BODY_TESTS);
+      $self->{tmethod}->{$values[0]} = {
+        type => 'stringtrim', args => '0:8192'
+      };
+    }
+  });
+
 =item uri SYMBOLIC_TEST_NAME /pattern/modifiers
 
 Define a uri pattern test. C<pattern> is a Perl regular expression. Note: as
@@ -2000,6 +2109,35 @@
     }
   });
 
+=item fast_rawbody SYMBOLIC_TEST_NAME /pattern/modifiers
+
+Same as C<rawbody> except it predefines a C<stringtrim> C<tmethod>
+of C<0:32768>, which means any fast_rawbody rules will only apply to the
+first 32kb worth of raw-body content max. If your rule needs access
+to more raw-body content than 32kb, use a C<rawbody> rule and
+define a C<tmethod> separately.
+
+C<fast_rawbody> evals are not supported.
+
+=cut
+
+  push (@cmds, {
+    setting => 'fast_rawbody',
+    is_frequent => 1,
+    is_priv => 1,
+    code => sub {
+      my ($self, $key, $value, $line) = @_;
+      my @values = split(/\s+/, $value, 2);
+      if (@values != 2) {
+        return $MISSING_REQUIRED_VALUE;
+      }
+      $self->{parser}->add_test (@values, $TYPE_RAWBODY_TESTS);
+      $self->{tmethod}->{$values[0]} = {
+        type => 'stringtrim', args => '0:32768'
+      };
+    }
+  });
+
 =item full SYMBOLIC_TEST_NAME /pattern/modifiers
 
 Define a full message pattern test. C<pattern> is a Perl regular expression.
@@ -2131,6 +2269,90 @@
     type => $CONF_TYPE_HASH_KEY_VALUE
   });
 
+=item tmethod SYMBOLIC_TEST_NAME [ {line|linerange|string|stringtrim} ] [args]
+
+used to alter how a test is performed. the default method applied
+to tests is C<line> with no args, which will apply the regexp
+per-line of content. tmethod currently only supports body and rawbody
+rule types. documentation for each tmethod is below:
+
+=over 4
+
+=item line [line num]
+
+the C<line> tmethod is the default method applied on all rules unless
+overridden with a different method. if C<tmethod> C<line> contains a
+C<line num> parameter, only that line number will be used to test
+the regexp.
+
+C<line> can be used with C<body> and C<rawbody> rule types.
+
+=item linerange [start:stop]
+
+the C<linerange> method allows you to apply a regexp against only
+a defined range of lines C<start:stop>. 0:5 would apply the regexp
+to the first 5 lines of content. -5:0 would apply the regexp to
+the last 5 lines of content.
+
+C<linerange> can be used with C<body> and C<rawbody> rule types.
+
+=item string [start_pos:bytes]
+
+the C<string> method allows you to apply a regexp against only
+a predefined byte range by first setting a C<start_pos>, and then
+defining the amount of C<bytes> to test from that position. the
+C<start_pos> may be negative, and in those instances, the position
+will be set from the end of the size-capped content. to scan the last 2kb of
+content, you could define the parameter as C<-2048:2048>.
+to scan the first 512 bytes of content, C<0:512> would be used.
+
+C<string> can be used with C<body> and C<rawbody> rule types.
+
+=item stringtrim [start_pos:bytes]
+
+the C<stringtrim> method works identically to the C<string> method
+discussed above, except it strips all newline and extra whitespace
+from the content before testing.
+
+C<stringtrim> can be used with C<body> and C<rawbody> rule types.
+
+=back
+
+=cut
+
+  push (@cmds, {
+    setting => 'tmethod',
+    is_frequent => 1,
+    is_priv => 1,
+    code => sub {
+      my ($self, $key, $value, $line) = @_;
+
+      if ($value =~ /^(\S+)\s+(line|linerange|string|stringtrim)\s+(.+)$/) {
+        my $name = $1;
+        my $type = $2;
+        my $args = $3;
+        dbg("rules: override tmethod for rule $name, type $type, args $args");
+        $self->{tmethod}->{$name} = {
+          type => $type, args => $args
+        };
+      }
+      elsif ($value =~ /^(\S+)\s+(line|linerange|string|stringtrim)$/) {
+        my $name = $1;
+        my $type = $2;
+        dbg("rules: override tmethod for rule $name, type $type");
+        $self->{tmethod}->{$name} = {
+          type => $type, args => undef
+        };
+      }
+      elsif ($value =~ /^$/) {
+        return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
+      }
+      else {
+        return $Mail::SpamAssassin::Conf::INVALID_VALUE;
+      }
+    }
+  });
+
 =item priority SYMBOLIC_TEST_NAME n
 
 Assign a specific priority to a test. All tests, except for DNS and Meta