Index: PerMsgStatus.pm
===================================================================
--- PerMsgStatus.pm	(revision 345325)
+++ PerMsgStatus.pm	(working copy)
@@ -50,6 +50,13 @@
 use warnings;
 use Carp;
 
+# was used to performance-test rules with the various tmethods.
+# BEGIN {
+#  eval { require Time::HiRes };
+#  Time::HiRes->import( qw(gettimeofday) ) unless $@;
+#  Time::HiRes->import( qw(tv_interval) ) unless $@;
+# }
+
 use Mail::SpamAssassin::Constants qw(:sa);
 use Mail::SpamAssassin::EvalTests;
 use Mail::SpamAssassin::Conf;
@@ -1772,7 +1779,7 @@
   my ($self, $priority, $textary) = @_;
   local ($_);
 
-  dbg("rules: running body-text per-line regexp tests; score so far=".$self->{score});
+  dbg("rules: running body-text regexp tests; score so far=".$self->{score});
 
   my $doing_user_rules = 
     $self->{conf}->{user_rules_to_compile}->{$Mail::SpamAssassin::Conf::TYPE_BODY_TESTS};
@@ -1810,15 +1817,113 @@
     $evalstr2 .= '
     sub '.$rulename.'_body_test {
            my $self = shift;
-           foreach (@_) {
-             '.$self->hash_line_for_rule($rulename).'
-             if ('.$pat.') { 
-                $self->got_pattern_hit(q{'.$rulename.'}, "BODY: "); 
-                '. $self->hit_rule_plugin_code($rulename, "body") . '
-		# Ok, we hit, stop now.
-		last unless $self->{conf}->{tflags}->{q{'.$rulename.'}} =~ /\bmultiple\b/;
-             }
+
+           # currently using Time::HiRes to do performance testing on 
+           # individual rules...
+           # my $start_time = [gettimeofday];
+
+           my $tmethod = $self->{conf}->{tmethod}->{'.$rulename.'}->{type} || "line";
+           my $args = $self->{conf}->{tmethod}->{'.$rulename.'}->{args} || "";
+
+           # tmethod: line and linerange
+           if ($tmethod =~ m/^line/) {
+
+              my $linecount=1;
+              my $maxlines = '.$self->{conf}->{tmethod_max_lines}.';
+
+              foreach (@_) {
+                # number the line up front: a "next" below must not skip
+                # the increment, which used to leave the counter stuck at 1
+                my $lineno = $linecount++;
+
+                # support predefined line number scanning
+                if ($tmethod eq "line" && $args =~ m/^\d+$/) {
+                   next unless ($lineno == $args);
+                }
+                # support line range scanning (start inclusive, stop
+                # exclusive); the start/stop delta may not exceed the
+                # config option tmethod_max_lines
+                elsif ($tmethod eq "linerange" && $args =~ m/(\-?\d+):(\-?\d+)/) {
+                   # both captures are set whenever the match succeeds
+                   my ($start,$stop) = ($1,$2);
+                   if ($stop - $start > $maxlines) {
+                     $stop = $start + $maxlines;
+                   }
+                   next unless ($lineno >= $start && $lineno < $stop);
+                }
+
+                '.$self->hash_line_for_rule($rulename).'
+                if ('.$pat.') {
+                   $self->got_pattern_hit(q{'.$rulename.'}, "BODY: ");
+                   '. $self->hit_rule_plugin_code($rulename, "body") . '
+                   # Ok, we hit, stop now.
+                   last unless $self->{conf}->{tflags}->{q{'.$rulename.'}} =~ /\bmultiple\b/;
+                }
+              }
            }
+
+           elsif ($tmethod =~ m/^string/) {
+
+              # start from an empty string rather than undef, so that a
+              # message without content lines never feeds an undefined
+              # value into substr or the substitutions below
+              my $fastbody = "";
+              my $maxbytes = '.$self->{conf}->{tmethod_max_bytes}.';
+
+              # args of the form start_pos:bytes select the window that
+              # feeds into the substr call later.  without usable args
+              # the window is start_pos=0 and bytes=tmethod_max_bytes
+              my ($start_pos,$bytes) = (0,$maxbytes);
+              if ($args && $args =~ m/(\-?\d+):(\-?\d+)/) {
+                   ($start_pos,$bytes) = ($1,$2);
+              }
+
+              # append all content lines to fastbody scalar
+              # until the size of the scalar exceeds tmethod_max_bytes
+              foreach (@_) {
+                $fastbody .= $_;
+                if (length $fastbody > $maxbytes) {
+                    last;
+                }
+              }
+
+              # substr the content down based on start pos and # of bytes.
+              # substr returns undef for a window outside of the data
+              $fastbody = substr($fastbody,$start_pos,$bytes);
+              $fastbody = "" unless defined $fastbody;
+
+              # if the length of fastbody still exceeds tmethod_max_bytes
+              # we need to substr it down further to avoid expensive regexp
+              # tests, ie, string <startpos>:512000 could be expensive
+              my $fblen = length $fastbody;
+              if ($fblen > $maxbytes) {
+                 $fastbody = substr($fastbody,0,$maxbytes);
+              }
+
+              # if the tmethod is stringtrim, we need to convert
+              # newlines to space, and then convert excess whitespace
+              # to a single space.  this is most beneficial in rawbody
+              # ruletypes as there is no efficient way currently to get
+              # html into a single trimmed string.
+              if ($tmethod eq "stringtrim") {
+                $fastbody =~ s/[\n\r]/ /gs;
+                $fastbody =~ s/\s+/ /g;
+              }
+
+              '.$self->hash_line_for_rule($rulename).'
+              if ($fastbody && $fastbody =~ '.$pat.') {
+                 $self->got_pattern_hit(q{'.$rulename.'}, "BODY: ");
+                 '. $self->hit_rule_plugin_code($rulename, "body") . '
+              }
+              # dbg("rules: '.$rulename.' - start position $start_pos, bytes $bytes, total length $fblen");
+           }
+           else {
+             warn("rules: invalid method type defined for rule '.$rulename.'");
+           }
+
+           # my $elapsed_time = tv_interval ($start_time, [gettimeofday]);
+           # dbg("rules: '.$rulename.' took $elapsed_time seconds, using method=[$tmethod] args=[$args]");
+
     }
     ';
   }
@@ -2312,18 +2417,115 @@
 
     $evalstr2 .= '
     sub '.$rulename.'_rawbody_test {
-       my $self = shift;
-       foreach (@_) {
-         '.$self->hash_line_for_rule($rulename).'
-         if ('.$pat.') { 
-            $self->got_pattern_hit(q{'.$rulename.'}, "RAW: ");
-            '. $self->hit_rule_plugin_code($rulename, "rawbody") . '
-            # Ok, we hit, stop now.
-	    last unless $self->{conf}->{tflags}->{q{'.$rulename.'}} =~ /\bmultiple\b/;
-         }
-       }
-    }
-    ';
+           my $self = shift;
+
+           # currently using Time::HiRes to do performance testing on
+           # individual rules...
+           # my $start_time = [gettimeofday];
+
+           my $tmethod = $self->{conf}->{tmethod}->{'.$rulename.'}->{type} || "line";
+           my $args = $self->{conf}->{tmethod}->{'.$rulename.'}->{args} || "";
+
+           # tmethod: line and linerange
+           if ($tmethod =~ m/^line/) {
+
+              my $linecount=1;
+              my $maxlines = '.$self->{conf}->{tmethod_max_lines_raw}.';
+
+              foreach (@_) {
+                # number the line up front: a "next" below must not skip
+                # the increment, which used to leave the counter stuck at 1
+                my $lineno = $linecount++;
+
+                # support predefined line number scanning
+                if ($tmethod eq "line" && $args =~ m/^\d+$/) {
+                   next unless ($lineno == $args);
+                }
+                # support line range scanning (start inclusive, stop
+                # exclusive); the start/stop delta may not exceed the
+                # config option tmethod_max_lines_raw
+                elsif ($tmethod eq "linerange" && $args =~ m/(\-?\d+):(\-?\d+)/) {
+                   # both captures are set whenever the match succeeds
+                   my ($start,$stop) = ($1,$2);
+                   if ($stop - $start > $maxlines) {
+                     $stop = $start + $maxlines;
+                   }
+                   next unless ($lineno >= $start && $lineno < $stop);
+                }
+
+                '.$self->hash_line_for_rule($rulename).'
+                if ('.$pat.') {
+                   $self->got_pattern_hit(q{'.$rulename.'}, "RAW: ");
+                   '. $self->hit_rule_plugin_code($rulename, "rawbody") . '
+                   # Ok, we hit, stop now.
+                   last unless $self->{conf}->{tflags}->{q{'.$rulename.'}} =~ /\bmultiple\b/;
+                }
+              }
+           }
+
+           elsif ($tmethod =~ m/^string/) {
+
+              # start from an empty string rather than undef, so that a
+              # message without content lines never feeds an undefined
+              # value into substr or the substitutions below
+              my $fastbody = "";
+              my $maxbytes = '.$self->{conf}->{tmethod_max_bytes_raw}.';
+
+              # args of the form start_pos:bytes select the window that
+              # feeds into the substr call later.  without usable args
+              # the window is start_pos=0 and bytes=tmethod_max_bytes_raw
+              my ($start_pos,$bytes) = (0,$maxbytes);
+              if ($args && $args =~ m/(\-?\d+):(\-?\d+)/) {
+                   ($start_pos,$bytes) = ($1,$2);
+              }
+
+              # append all content lines to fastbody scalar
+              # until the size of the scalar exceeds tmethod_max_bytes_raw
+              foreach (@_) {
+                $fastbody .= $_;
+                if (length $fastbody > $maxbytes) {
+                    last;
+                }
+              }
+
+              # substr the content down based on start pos and # of bytes.
+              # substr returns undef for a window outside of the data
+              $fastbody = substr($fastbody,$start_pos,$bytes);
+              $fastbody = "" unless defined $fastbody;
+
+              # if the length of fastbody still exceeds tmethod_max_bytes_raw
+              # we need to substr it down further to avoid expensive regexp
+              # tests, ie, string <startpos>:512000 could be expensive
+              my $fblen = length $fastbody;
+              if ($fblen > $maxbytes) {
+                 $fastbody = substr($fastbody,0,$maxbytes);
+              }
+
+              # if the tmethod is stringtrim, we need to convert
+              # newlines to space, and then convert excess whitespace
+              # to a single space.  this is most beneficial in rawbody
+              # ruletypes as there is no efficient way currently to get
+              # html into a single trimmed string.
+              if ($tmethod eq "stringtrim") {
+                $fastbody =~ s/[\n\r]+/ /gs;
+                $fastbody =~ s/\s+/ /g;
+              }
+
+              '.$self->hash_line_for_rule($rulename).'
+              if ($fastbody && $fastbody =~ '.$pat.') {
+                 $self->got_pattern_hit(q{'.$rulename.'}, "RAW: ");
+                 '. $self->hit_rule_plugin_code($rulename, "rawbody") . '
+              }
+              #dbg("rules: '.$rulename.' - start position $start_pos, bytes $bytes, total length $fblen");
+           }
+           else {
+             warn("rules: invalid method type defined for rule '.$rulename.'");
+           }
+
+           # my $elapsed_time = tv_interval ($start_time, [gettimeofday]);
+           # dbg("rules: '.$rulename.' took $elapsed_time seconds, using method=[$tmethod] args=[$args]");
+        }
+        ';
   }
 
   # clear out a previous version of this fn, if already defined
Index: Conf.pm
===================================================================
--- Conf.pm	(revision 345325)
+++ Conf.pm	(working copy)
@@ -1581,6 +1581,87 @@
     code => \&Mail::SpamAssassin::Conf::Parser::set_template_clear
   });
 
+=item tmethod_max_lines_raw             (Default: 256)
+
+This setting sets the maximum number of lines that should be tested
+for a C<rawbody> rule when C<tmethod> is set to C<linerange>.  This
+will prevent a rule with a large start and stop delta
+(i.e. C<tmethod> "linerange 1:1000") from becoming very inefficient
+on large messages.  If you want to scan every line in a message,
+simply omit a tmethod from the rule completely.  C<tmethod> C<linerange>
+should be used to improve the efficiency of a rule when you know
+the location of data.  By default, the maximum number of raw content
+lines scanned will be 256.
+
+=cut
+
+  push (@cmds, {
+    setting => 'tmethod_max_lines_raw',
+    default => 256,   # cap on the linerange window of rawbody rules
+    type => $CONF_TYPE_NUMERIC
+  });
+
+=item tmethod_max_lines			(Default: 16)
+
+This setting sets the maximum number of lines that should be tested
+for a C<body> rule when C<tmethod> is set to C<linerange>.  This will
+prevent a body rule with a large start and stop delta
+(i.e. C<tmethod> "linerange 1:1000") from becoming very inefficient
+on large messages.  If you want to scan every line in a message,
+simply omit a tmethod from the rule completely.  C<tmethod> C<linerange>
+should be used to improve the efficiency of a rule when you know
+the location of data.  By default, the maximum number of body content
+lines scanned will be 16.  Realize that body rules by default have
+most of the whitespace and newlines trimmed already, so even if the
+message is hundreds of lines long, it may be trimmed down to a dozen or
+fewer lines due to the nature of the message parser.
+
+=cut
+
+  push (@cmds, {
+    setting => 'tmethod_max_lines',
+    default => 16,   # cap on the linerange window of body rules
+    type => $CONF_TYPE_NUMERIC
+  });
+
+=item tmethod_max_bytes_raw		(Default: 65536)
+
+This setting sets the maximum number of bytes that a regexp can be
+tested against for a C<rawbody> rule when C<tmethod> is set to
+C<string> or C<stringtrim>.  This will prevent a rule with a large
+value for bytes (i.e. C<tmethod> "string 0:512000") from
+becoming inefficient on large messages.  C<tmethod> C<string>
+should only be used to improve rule efficiency when you know the
+general location of the data you are testing.  By default, the maximum
+amount of usable raw data is 64kb.
+
+=cut
+  
+  push (@cmds, {
+    setting => 'tmethod_max_bytes_raw',
+    default => 65536,   # 64 KB of rawbody text for string/stringtrim
+    type => $CONF_TYPE_NUMERIC
+  });
+
+=item tmethod_max_bytes			(Default: 16384)
+
+This setting sets the maximum number of bytes that a regexp can be
+tested against for a C<body> rule when C<tmethod> is set to
+C<string> or C<stringtrim>.  This will prevent a rule with a large
+value for bytes (i.e. C<tmethod> "string 0:512000") from becoming
+inefficient on large messages.  C<tmethod> C<string> should only be
+used to improve rule efficiency when you know the general location
+of the data you are testing.  By default, the maximum amount of usable
+body is 16kb.
+
+=cut
+
+  push (@cmds, {
+    setting => 'tmethod_max_bytes',
+    default => 16384,   # 16 KB of body text for string/stringtrim
+    type => $CONF_TYPE_NUMERIC
+  });
+
 =back
 
 =head1 RULE DEFINITIONS AND PRIVILEGED SETTINGS
@@ -1933,6 +2014,34 @@
     }
   });
 
+=item fast_body SYMBOLIC_TEST_NAME /pattern/modifiers
+
+Same as C<body> except it predefines a C<stringtrim> C<tmethod> of C<0:8192>,
+which means any fast_body rule will only be applied to the first 8kb
+of body content at most.  If your rule needs access to more body content
+than 8kb, use a C<body> rule and define a C<tmethod> separately.
+
+C<fast_body> evals are not supported.
+
+=cut
+
+  push (@cmds, {
+    setting => 'fast_body',
+    is_frequent => 1,
+    is_priv => 1,
+    # Registers NAME as a body test and presets its tmethod to a
+    # stringtrim scan with args 0:8192.  A value lacking either the
+    # name or the pattern yields $MISSING_REQUIRED_VALUE.
+    code => sub {
+      my ($self, $key, $value, $line) = @_;
+      my ($rulename, $pattern) = split(/\s+/, $value, 2);
+      return $MISSING_REQUIRED_VALUE unless defined $pattern;
+
+      $self->{parser}->add_test ($rulename, $pattern, $TYPE_BODY_TESTS);
+      $self->{tmethod}->{$rulename} = { type => 'stringtrim', args => '0:8192' };
+    }
+  });
+
 =item uri SYMBOLIC_TEST_NAME /pattern/modifiers
 
 Define a uri pattern test.  C<pattern> is a Perl regular expression.  Note: as
@@ -2000,6 +2109,35 @@
     }
   });
 
+=item fast_rawbody SYMBOLIC_TEST_NAME /pattern/modifiers
+
+Same as C<rawbody> except it predefines a C<tmethod> C<stringtrim>
+of C<0:32768>, which means any fast_rawbody rule will only be applied to
+the first 32kb of raw-body content at most.  If your rule needs access
+to more raw-body content than 32kb, use a C<rawbody> rule and
+define a C<tmethod> separately.
+
+C<fast_rawbody> evals are not supported.
+
+=cut
+
+  push (@cmds, {
+    setting => 'fast_rawbody',
+    is_frequent => 1,
+    is_priv => 1,
+    # Registers NAME as a rawbody test and presets its tmethod to a
+    # stringtrim scan with args 0:32768.  A value lacking either the
+    # name or the pattern yields $MISSING_REQUIRED_VALUE.
+    code => sub {
+      my ($self, $key, $value, $line) = @_;
+      my ($rulename, $pattern) = split(/\s+/, $value, 2);
+      return $MISSING_REQUIRED_VALUE unless defined $pattern;
+
+      $self->{parser}->add_test ($rulename, $pattern, $TYPE_RAWBODY_TESTS);
+      $self->{tmethod}->{$rulename} = { type => 'stringtrim', args => '0:32768' };
+    }
+  });
+
 =item full SYMBOLIC_TEST_NAME /pattern/modifiers
 
 Define a full message pattern test.  C<pattern> is a Perl regular expression.
@@ -2131,6 +2269,90 @@
     type => $CONF_TYPE_HASH_KEY_VALUE
   });
 
+=item tmethod SYMBOLIC_TEST_NAME {line|linerange|string|stringtrim} [args]
+
+Used to alter how a test is performed.  The default method applied
+to tests is C<line> with no args, which will apply the regexp
+to each line of content.  tmethod currently only supports body and rawbody
+rule types.  Documentation for each tmethod is below:
+
+=over 4
+
+=item line [line num]
+
+the C<line> tmethod is the default method applied on all rules unless
+overridden with a different method.  if C<tmethod> C<line> contains a
+C<line num> parameter, only that line number will be used to test
+the regexp.
+
+C<line> can be used with C<body> and C<rawbody> rule types.
+
+=item linerange [start:stop]
+
+the C<linerange> method allows you to apply a regexp against only
+a defined range of lines C<start:stop>.  0:5 would apply the regexp
+to the first 5 lines of content.  -5:0 would apply the regexp to 
+the last 5 lines of content.  
+
+C<linerange> can be used with C<body> and C<rawbody> rule types.
+
+=item string  [start_pos:bytes]
+
+The C<string> method allows you to apply a regexp against only
+a predefined byte range by first setting a C<start_pos>, and then
+defining the number of C<bytes> to test from that position.  The
+C<start_pos> may be negative, and in those instances, the position
+will be set from the end of the content.  To scan the last 2kb of
+content, you could define the parameter as C<-2048:2048>.
+To scan the first 512 bytes of content, C<0:512> would be used.
+
+C<string> can be used with C<body> and C<rawbody> rule types.
+
+=item stringtrim [start_pos:bytes]
+
+The C<stringtrim> method works identically to the C<string> method
+discussed above, except it converts newlines to spaces and collapses
+runs of whitespace into a single space before testing.
+
+C<stringtrim> can be used with C<body> and C<rawbody> rule types.
+
+=back
+
+=cut
+
+  push (@cmds, {
+    setting => 'tmethod',
+    is_frequent => 1,
+    is_priv => 1,
+    # Handler for "tmethod NAME TYPE [ARGS]": records the test method
+    # (and optional arguments) for rule NAME in $self->{tmethod}->{NAME}.
+    # Returns MISSING_REQUIRED_VALUE for an empty value and INVALID_VALUE
+    # when the value is malformed or TYPE is not a known method.
+    code => sub {
+      my ($self, $key, $value, $line) = @_;
+
+      if ($value =~ /^$/) {
+        return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
+      }
+      # NAME and TYPE are mandatory, everything after TYPE is the args
+      unless ($value =~ /^(\S+)\s+(\S+)(?:\s+(.+))?$/) {
+        return $Mail::SpamAssassin::Conf::INVALID_VALUE;
+      }
+      my ($name, $type, $args) = ($1, $2, $3);
+
+      # reject unknown methods here rather than warning on every scan
+      unless ($type =~ /^(?:line|linerange|string|stringtrim)$/) {
+        return $Mail::SpamAssassin::Conf::INVALID_VALUE;
+      }
+
+      dbg("rules: override tmethod for rule $name, type $type" .
+          (defined $args ? ", args $args" : ""));
+      $self->{tmethod}->{$name} = {
+        type => $type, args => $args
+      };
+    }
+  });
+
 =item priority SYMBOLIC_TEST_NAME n
 
 Assign a specific priority to a test.  All tests, except for DNS and Meta