Add per-rule "range" restrictions for body and rawbody rules.

A rule can be limited to a span of lines ("line") or to a window of bytes
("byte", or "bytetrim" which collapses whitespace first).  The new
range_max_* settings cap how much content a ranged rule may look at, and
fast_body / fast_rawbody are shorthands that predefine a bytetrim range.

NOTE(review): this patch was reformatted from a whitespace-collapsed copy
in which POD C<...> codes had been stripped.  Indentation of context and
removed lines, and the C<pattern> codes in context lines, are
reconstructed -- apply with "patch -l" if exact matching fails.

Index: lib/Mail/SpamAssassin/PerMsgStatus.pm
===================================================================
--- lib/Mail/SpamAssassin/PerMsgStatus.pm	(revision 348981)
+++ lib/Mail/SpamAssassin/PerMsgStatus.pm	(working copy)
@@ -1593,6 +1593,95 @@
 
 ###########################################################################
 
+# Look up the range restriction configured for a body or rawbody rule
+# (the "range", "fast_body" and "fast_rawbody" settings store it in
+# $conf->{range}).
+#
+#   $rulename - symbolic name of the rule
+#   $content  - "body" or "rawbody"; selects the range_max_* limit
+#
+# Returns ($type, $arg1, $arg2, $max):
+#   "line":  $arg1 is the 0-based index of the first line to test and
+#            $arg2 the 0-based index one past the last line.  Both are
+#            undef when no line range was given (test every line).
+#   "byte*": $arg1 is the start offset (negative counts back from the
+#            end of the content) and $arg2 the number of bytes to test
+#            (undef means "to the end", only when no limit is set).
+#   $max is the applicable range_max_* setting.
+#
+sub get_range_data {
+  my ($self, $rulename, $content) = @_;
+
+  my $range = $self->{conf}->{range}->{$rulename} || {};
+  my $type = $range->{type} || "line";
+  my $args = defined $range->{args} ? $range->{args} : "";
+
+  dbg("range_data: type=$type args=$args for rule $rulename");
+
+  # limits are kept per unit (line/byte) and per content (body/rawbody)
+  my $config = 'range_max_' . ($type eq "line" ? 'line' : 'byte');
+  $config .= '_raw' if (defined $content && $content =~ /^raw/);
+  my $max = $self->{conf}->{$config};
+  my $has_max = (defined $max && $max > 0);
+
+  # args are "[first][:second]"; only a byte offset may be negative
+  my ($first, $second);
+  if ($args =~ /^\s*(-?\d+)?(?::(\d+))?\s*$/) {
+    ($first, $second) = ($1, $2);
+  }
+  else {
+    dbg("range_data: ignoring unparsable args '$args' for rule $rulename");
+  }
+
+  my ($arg1, $arg2);
+  if ($type eq "line") {
+    # no range given: the caller tests every line
+    if (!defined $first && !defined $second) {
+      return ($type, undef, undef, $max);
+    }
+
+    # users count lines from 1; a lone "N" means just line N
+    $first = 1 if (!defined $first || $first < 1);
+    $second = $first if (!defined $second || $second < $first);
+
+    # convert to the 0-based half-open interval [$arg1, $arg2)
+    $arg1 = $first - 1;
+    $arg2 = $second;
+    $arg2 = $arg1 + $max if ($has_max && ($arg2 - $arg1) > $max);
+  }
+  else {
+    # byte and bytetrim: offset plus length, length defaults to the limit
+    $arg1 = defined $first ? $first : 0;
+    $arg2 = defined $second ? $second : ($has_max ? $max : undef);
+    $arg2 = $max if ($has_max && $arg2 > $max);
+  }
+
+  return ($type, $arg1, $arg2, $max);
+}
+
+# Build the string a byte or bytetrim range rule is matched against: the
+# content lines joined together (whitespace-collapsed for "bytetrim") and
+# cut down to $length bytes starting at $offset.  A window that lies
+# partly outside the content is clipped; one entirely outside yields "".
+sub get_range_content {
+  my ($self, $type, $offset, $length, $lines) = @_;
+
+  my $text = join("", @{$lines});
+  if ($type eq "bytetrim") {
+    $text =~ s/[\n\r]/ /gs;
+    $text =~ s/\s+/ /g;
+  }
+
+  my $len = length($text);
+  $offset = 0 if (!defined $offset);
+  my $from = ($offset < 0) ? $len + $offset : $offset;
+  my $to = defined $length ? $from + $length : $len;
+  $from = 0 if ($from < 0);
+  $to = $len if ($to > $len);
+
+  return ($to > $from) ? substr($text, $from, $to - $from) : "";
+}
+
 sub start_rules_plugin_code {
   my ($self, $ruletype) = @_;
 
@@ -1807,18 +1896,40 @@
       next if (!$self->is_user_rule_sub ($rulename.'_body_test'));
     }
 
     $evalstr2 .= '
       sub '.$rulename.'_body_test {
-           my $self = shift;
-           foreach (@_) {
-             '.$self->hash_line_for_rule($rulename).'
-             if ('.$pat.') {
-                $self->got_pattern_hit(q{'.$rulename.'}, "BODY: ");
-                '. $self->hit_rule_plugin_code($rulename, "body") . '
-                # Ok, we hit, stop now.
-                last unless $self->{conf}->{tflags}->{q{'.$rulename.'}} =~ /\bmultiple\b/;
-             }
-           }
+        my $self = shift;
+        my $hit = 0;
+        my ($type, $arg1, $arg2) = $self->get_range_data(q{'.$rulename.'}, "body");
+        if ($type eq "line") {
+          # $x is the 0-based index of the current line; it is advanced before
+          # any "next" so that skipped lines are still counted
+          my $x = -1;
+          foreach (@_) {
+            $x++;
+            if (defined $arg1) {
+              next if ($x < $arg1);
+              last if ($x >= $arg2);
+            }
+            '.$self->hash_line_for_rule($rulename).'
+            if ('.$pat.') {
+              $hit++;
+              '. $self->hit_rule_plugin_code($rulename, "body") . '
+              last unless $self->{conf}->{tflags}->{q{'.$rulename.'}} =~ /\bmultiple\b/;
+            }
+          }
+        }
+        elsif ($type =~ m/^byte/) {
+          my $fb = $self->get_range_content($type, $arg1, $arg2, \@_);
+          '.$self->hash_line_for_rule($rulename).'
+          pos $fb = 0;
+          while ($fb =~ '.$pat.'g) {
+            $hit++;
+            '. $self->hit_rule_plugin_code($rulename, "body") . '
+            last unless $self->{conf}->{tflags}->{q{'.$rulename.'}} =~ /\bmultiple\b/;
+          }
+        }
+        $self->got_hit(q{'.$rulename.'}, "BODY: ", $hit) if ($hit);
       }
     ';
   }
@@ -2310,18 +2421,40 @@
       next if (!$self->is_user_rule_sub ($rulename.'_rawbody_test'));
     }
 
     $evalstr2 .= '
       sub '.$rulename.'_rawbody_test {
-           my $self = shift;
-           foreach (@_) {
-             '.$self->hash_line_for_rule($rulename).'
-             if ('.$pat.') {
-                $self->got_pattern_hit(q{'.$rulename.'}, "RAW: ");
-                '. $self->hit_rule_plugin_code($rulename, "rawbody") . '
-                # Ok, we hit, stop now.
-                last unless $self->{conf}->{tflags}->{q{'.$rulename.'}} =~ /\bmultiple\b/;
-             }
-           }
+        my $self = shift;
+        my $hit = 0;
+        my ($type, $arg1, $arg2) = $self->get_range_data(q{'.$rulename.'}, "rawbody");
+        if ($type eq "line") {
+          # $x is the 0-based index of the current line; it is advanced before
+          # any "next" so that skipped lines are still counted
+          my $x = -1;
+          foreach (@_) {
+            $x++;
+            if (defined $arg1) {
+              next if ($x < $arg1);
+              last if ($x >= $arg2);
+            }
+            '.$self->hash_line_for_rule($rulename).'
+            if ('.$pat.') {
+              $hit++;
+              '. $self->hit_rule_plugin_code($rulename, "rawbody") . '
+              last unless $self->{conf}->{tflags}->{q{'.$rulename.'}} =~ /\bmultiple\b/;
+            }
+          }
+        }
+        elsif ($type =~ m/^byte/) {
+          my $fb = $self->get_range_content($type, $arg1, $arg2, \@_);
+          '.$self->hash_line_for_rule($rulename).'
+          pos $fb = 0;
+          while ($fb =~ '.$pat.'g) {
+            $hit++;
+            '. $self->hit_rule_plugin_code($rulename, "rawbody") . '
+            last unless $self->{conf}->{tflags}->{q{'.$rulename.'}} =~ /\bmultiple\b/;
+          }
+        }
+        $self->got_hit(q{'.$rulename.'}, "RAW: ", $hit) if ($hit);
       }
     ';
   }
Index: lib/Mail/SpamAssassin/Conf.pm
===================================================================
--- lib/Mail/SpamAssassin/Conf.pm	(revision 348981)
+++ lib/Mail/SpamAssassin/Conf.pm	(working copy)
@@ -1581,6 +1581,87 @@
     code => \&Mail::SpamAssassin::Conf::Parser::set_template_clear
   });
 
+=item range_max_line_raw (Default: 256)
+
+This setting sets the maximum number of lines that will be tested
+for a C<rawbody> rule when C<range> is set to C<line>.  This
+prevents a rule with a large start and stop delta
+(e.g. C<range RULENAME line 1:1000>) from becoming very inefficient
+on large messages.  If you want to scan every line in a message,
+simply omit the range from the rule completely.  C<range> C<line>
+should be used to improve the efficiency of a rule when you know
+the location of the data.  By default, the maximum number of raw
+content lines scanned is 256.
+
+=cut
+
+  push (@cmds, {
+    setting => 'range_max_line_raw',
+    default => 256,
+    type => $CONF_TYPE_NUMERIC
+  });
+
+=item range_max_line (Default: 16)
+
+This setting sets the maximum number of lines that will be tested
+for a C<body> rule when C<range> is set to C<line>.  This
+prevents a body rule with a large start and stop delta
+(e.g. C<range RULENAME line 1:1000>) from becoming very inefficient
+on large messages.  If you want to scan every line in a message,
+simply omit the range from the rule completely.  C<range> C<line>
+should be used to improve the efficiency of a rule when you know
+the location of the data.  By default, the maximum number of body
+content lines scanned is 16.  Note that body rules by default have
+most of the whitespace and newlines trimmed already, so even if the
+message is hundreds of lines long, it may be trimmed down to a dozen
+lines or fewer by the message parser.
+
+=cut
+
+  push (@cmds, {
+    setting => 'range_max_line',
+    default => 16,
+    type => $CONF_TYPE_NUMERIC
+  });
+
+=item range_max_byte_raw (Default: 65536)
+
+This setting sets the maximum number of bytes that a regexp can be
+tested against for a C<rawbody> rule when C<range> is set to
+C<byte> or C<bytetrim>.  This prevents a rule with a large
+byte count (e.g. C<range RULENAME byte 0:256000>) from
+becoming inefficient on large messages.  C<range> C<byte>
+should only be used to improve rule efficiency when you know the
+general location of the data you are testing.  By default, the
+maximum amount of usable raw data is 64kb.
+
+=cut
+
+  push (@cmds, {
+    setting => 'range_max_byte_raw',
+    default => 65536,
+    type => $CONF_TYPE_NUMERIC
+  });
+
+=item range_max_byte (Default: 16384)
+
+This setting sets the maximum number of bytes that a regexp can be
+tested against for a C<body> rule when C<range> is set to
+C<byte> or C<bytetrim>.  This prevents a rule with a large
+byte count (e.g. C<range RULENAME byte 0:512000>) from becoming
+inefficient on large messages.  C<range> C<byte> should only be
+used to improve rule efficiency when you know the general location
+of the data you are testing.  By default, the maximum amount of
+usable body content is 16kb.
+
+=cut
+
+  push (@cmds, {
+    setting => 'range_max_byte',
+    default => 16384,
+    type => $CONF_TYPE_NUMERIC
+  });
+
 =back
 
 =head1 RULE DEFINITIONS AND PRIVILEGED SETTINGS
@@ -1933,6 +2014,34 @@
     }
   });
 
+=item fast_body SYMBOLIC_TEST_NAME /pattern/modifiers
+
+Same as C<body> except it predefines a C<range> C<bytetrim> of C<0:8192>,
+which means any fast_body rule will only apply to the first 8kb worth
+of body content at most.  If your rule needs access to more body content
+than 8kb, use a C<body> rule and define a standalone C<range> separately.
+
+C<fast_body> evals are not supported, use C<body>.
+
+=cut
+
+  push (@cmds, {
+    setting => 'fast_body',
+    is_frequent => 1,
+    is_priv => 1,
+    code => sub {
+      my ($self, $key, $value, $line) = @_;
+      my @values = split(/\s+/, $value, 2);
+      if (@values != 2) {
+        return $MISSING_REQUIRED_VALUE;
+      }
+      $self->{parser}->add_test (@values, $TYPE_BODY_TESTS);
+      $self->{range}->{$values[0]} = {
+        type => 'bytetrim', args => '0:8192'
+      };
+    }
+  });
+
 =item uri SYMBOLIC_TEST_NAME /pattern/modifiers
 
 Define a uri pattern test.  C<pattern> is a Perl regular expression.  Note: as
@@ -2000,6 +2109,35 @@
     }
   });
 
+=item fast_rawbody SYMBOLIC_TEST_NAME /pattern/modifiers
+
+Same as C<rawbody> except it predefines a C<range> C<bytetrim>
+of C<0:32768>, which means any fast_rawbody rule will only apply to the
+first 32kb worth of raw-body content at most.  If your rule needs access
+to more raw-body content than 32kb, use a C<rawbody> rule and
+define a standalone C<range> separately.
+
+C<fast_rawbody> evals are not supported, use C<rawbody>.
+
+=cut
+
+  push (@cmds, {
+    setting => 'fast_rawbody',
+    is_frequent => 1,
+    is_priv => 1,
+    code => sub {
+      my ($self, $key, $value, $line) = @_;
+      my @values = split(/\s+/, $value, 2);
+      if (@values != 2) {
+        return $MISSING_REQUIRED_VALUE;
+      }
+      $self->{parser}->add_test (@values, $TYPE_RAWBODY_TESTS);
+      $self->{range}->{$values[0]} = {
+        type => 'bytetrim', args => '0:32768'
+      };
+    }
+  });
+
 =item full SYMBOLIC_TEST_NAME /pattern/modifiers
 
 Define a full message pattern test.  C<pattern> is a Perl regular expression.
@@ -2131,6 +2269,85 @@
     type => $CONF_TYPE_HASH_KEY_VALUE
   });
 
+=item range SYMBOLIC_TEST_NAME {line|byte|bytetrim} [args]
+
+Used to alter how a test is performed.  The default method applied
+to tests is C<line> with no args, which will apply the regexp
+per-line of content.  C<range> currently only supports body and rawbody
+rule types.  Documentation for each range option is below:
+
+=over 4
+
+=item line [line_start][:line_stop]
+
+The C<line> C<range> is the default method applied on all rules unless
+overridden with a different method.  If C<range> C<line> contains a
+C<line_start> parameter, only that line number will be used to test
+the regexp.  If it contains the C<line_start:line_stop> syntax, only
+that range of line numbers will be scanned.  Line numbers start at 1.
+
+C<line> can be used with C<body> and C<rawbody> rule types.
+
+=item byte [start_pos][:bytes]
+
+The C<byte> method allows you to apply a regexp against only
+a predefined byte range by first setting a C<start_pos>, and then
+defining the number of C<bytes> to test from that position.  If
+the C<bytes> value is omitted, the config option
+C<range_max_byte_raw> (for rawbody rules), or
+C<range_max_byte> (for body rules), will be used as the default.
+The C<start_pos> may be negative, and in those instances, the
+position will be set from the end of the content.  To scan the last
+2kb of content, you could define the parameter as C<-2048:2048>.  To
+scan the first 512 bytes of content, C<0:512> would be used.
+
+C<byte> can be used with C<body> and C<rawbody> rule types.
+
+=item bytetrim [start_pos][:bytes]
+
+The C<bytetrim> method works identically to the C<byte> method
+documented above, except it strips all newlines and extra whitespace
+from the content before testing.
+
+C<bytetrim> can be used with C<body> and C<rawbody> rule types.
+
+=back
+
+=cut
+
+  push (@cmds, {
+    setting => 'range',
+    is_frequent => 1,
+    is_priv => 1,
+    code => sub {
+      my ($self, $key, $value, $line) = @_;
+
+      if ($value =~ /^$/) {
+        return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
+      }
+
+      # only the three documented methods are accepted
+      my ($name, $type, $args) =
+        ($value =~ /^(\S+)\s+(line|byte|bytetrim)(?:\s+(\S+))?\s*$/);
+      if (!defined $name) {
+        return $Mail::SpamAssassin::Conf::INVALID_VALUE;
+      }
+
+      # args are "[start][:count]"; only byte offsets may be negative
+      if (defined $args &&
+          ($args !~ /^(?:-?\d+(?::\d+)?|:\d+)$/ ||
+           ($type eq 'line' && $args =~ /^-/))) {
+        return $Mail::SpamAssassin::Conf::INVALID_VALUE;
+      }
+
+      dbg("rules: override range for rule $name, type $type" .
+          (defined $args ? ", args $args" : ""));
+      $self->{range}->{$name} = {
+        type => $type, args => $args
+      };
+    }
+  });
+
 =item priority SYMBOLIC_TEST_NAME n
 
 Assign a specific priority to a test.  All tests, except for DNS and Meta