View | Details | Raw Unified | Return to bug 4691
Collapse All | Expand All

(-)PerMsgStatus.pm (-21 / +223 lines)
Lines 50-55 Link Here
50
use warnings;
50
use warnings;
51
use Carp;
51
use Carp;
52
52
53
# was used to performance-test rules using the various tmethods.
54
# BEGIN {
55
#  eval { require Time::HiRes };
56
#  Time::HiRes->import( qw(gettimeofday) ) unless $@;
57
#  Time::HiRes->import( qw(tv_interval) ) unless $@;
58
# }
59
53
use Mail::SpamAssassin::Constants qw(:sa);
60
use Mail::SpamAssassin::Constants qw(:sa);
54
use Mail::SpamAssassin::EvalTests;
61
use Mail::SpamAssassin::EvalTests;
55
use Mail::SpamAssassin::Conf;
62
use Mail::SpamAssassin::Conf;
Lines 1772-1778 Link Here
1772
  my ($self, $priority, $textary) = @_;
1779
  my ($self, $priority, $textary) = @_;
1773
  local ($_);
1780
  local ($_);
1774
1781
1775
  dbg("rules: running body-text per-line regexp tests; score so far=".$self->{score});
1782
  dbg("rules: running body-text regexp tests; score so far=".$self->{score});
1776
1783
1777
  my $doing_user_rules = 
1784
  my $doing_user_rules = 
1778
    $self->{conf}->{user_rules_to_compile}->{$Mail::SpamAssassin::Conf::TYPE_BODY_TESTS};
1785
    $self->{conf}->{user_rules_to_compile}->{$Mail::SpamAssassin::Conf::TYPE_BODY_TESTS};
Lines 1810-1824 Link Here
1810
    $evalstr2 .= '
1817
    $evalstr2 .= '
1811
    sub '.$rulename.'_body_test {
1818
    sub '.$rulename.'_body_test {
1812
           my $self = shift;
1819
           my $self = shift;
1813
           foreach (@_) {
1820
1814
             '.$self->hash_line_for_rule($rulename).'
1821
           # currently using Time::HiRes to do performance testing on 
1815
             if ('.$pat.') { 
1822
           # individual rules...
1816
                $self->got_pattern_hit(q{'.$rulename.'}, "BODY: "); 
1823
           # my $start_time = [gettimeofday];
1817
                '. $self->hit_rule_plugin_code($rulename, "body") . '
1824
1818
		# Ok, we hit, stop now.
1825
           my $tmethod = $self->{conf}->{tmethod}->{'.$rulename.'}->{type} || "line";
1819
		last unless $self->{conf}->{tflags}->{q{'.$rulename.'}} =~ /\bmultiple\b/;
1826
           my $args = $self->{conf}->{tmethod}->{'.$rulename.'}->{args} || "";
1820
             }
1827
1828
           # tmethod: line and linerange
1829
           if ($tmethod =~ m/^line/) {
1830
1831
              my $linecount=1;
1832
              my $maxlines = '.$self->{conf}->{tmethod_max_lines}.';
1833
1834
              foreach (@_) {
1835
1836
                # support predefined line number scanning
1837
                if ($tmethod eq "line" && $args =~ m/^\d+$/) {
1838
                   next unless ($linecount == $args);
1839
                }
1840
 
1841
                # support line range scanning
1842
                # do not allow the start/stop delta to exceed the
1843
                # config option tmethod_max_lines
1844
                elsif ($tmethod eq "linerange" && $args =~ m/(\-?\d+):(\-?\d+)/) {
1845
                   my ($start,$stop);
1846
                   if (defined $1) { $start = $1; } else { $start = 0; }
1847
                   if (defined $2) { $stop = $2; } else { $stop = scalar @_; }
1848
                   if ($stop - $start > $maxlines) {
1849
                     $stop = $start + $maxlines;
1850
                   }
1851
                   next unless ($linecount >= $start && $linecount < $stop);
1852
                }
1853
                
1854
                $linecount++;
1855
                '.$self->hash_line_for_rule($rulename).'
1856
                if ('.$pat.') { 
1857
                   $self->got_pattern_hit(q{'.$rulename.'}, "BODY: "); 
1858
                   '. $self->hit_rule_plugin_code($rulename, "body") . '
1859
	           # Ok, we hit, stop now.
1860
   	           last unless $self->{conf}->{tflags}->{q{'.$rulename.'}} =~ /\bmultiple\b/;
1861
                }
1862
              }
1821
           }
1863
           }
1864
1865
           elsif ($tmethod =~ m/^string/) {
1866
1867
              my ($fastbody,$start_pos,$bytes);
1868
              my $maxbytes = '.$self->{conf}->{tmethod_max_bytes}.';
1869
1870
              # if args are passed to method string, we will set start 
1871
              # position and number of bytes which will feed into 
1872
              # a substr call later.  if start and stop are not defined
1873
              # we manually set start=0 and stop=tmethod_max_bytes
1874
              if ($args && $args =~ m/(\-?\d+):(\-?\d+)/) {
1875
                   if (defined $1) { $start_pos = $1; } else { $start_pos = 0; }
1876
                   if (defined $2) { $bytes = $2; } else { $bytes = $maxbytes }
1877
              }
1878
              else {
1879
                   $start_pos = 0;
1880
                   $bytes = $maxbytes;
1881
              }
1882
 
1883
              # append all content lines to fastbody scalar
1884
              # until the size of the scalar exceeds tmethod_max_bytes 
1885
              foreach (@_) {
1886
                $fastbody .= $_;
1887
                if (length $fastbody > $maxbytes) {
1888
                    last;
1889
                }
1890
              }
1891
1892
              # substr the content down based on start pos and # of bytes
1893
              $fastbody = substr($fastbody,$start_pos,$bytes);
1894
1895
              # if the length of fastbody exceeds the config method_block_max_bytes
1896
              # we need to substr it down further to avoid expensive regexp tests
1897
              # ie, string <startpos> 512000 could be expensive depending on startpos
1898
              my $fblen = length $fastbody;
1899
              if ($fblen > $maxbytes) {
1900
                 $fastbody = substr($fastbody,0,$maxbytes);
1901
              }
1902
1903
              # if the tmethod is stringtrim, we need to convert
1904
              # newlines to space, and then convert excess whitespace
1905
              # to a single space.  this is most beneficial in rawbody
1906
              # ruletypes as there is no efficient way currently to get 
1907
              #  html into a single trimmed string.
1908
              if ($tmethod eq "stringtrim") {
1909
                $fastbody =~ s/[\n\r]/ /gs;
1910
                $fastbody =~ s/\s+/ /g;
1911
              }
1912
1913
              '.$self->hash_line_for_rule($rulename).'
1914
              if ($fastbody && $fastbody =~ '.$pat.') {
1915
                 $self->got_pattern_hit(q{'.$rulename.'}, "BODY: "); 
1916
                 '. $self->hit_rule_plugin_code($rulename, "body") . '
1917
              }
1918
              # dbg("rules: '.$rulename.' - start position $start_pos, bytes $bytes, total length $fblen");
1919
           }
1920
           else {
1921
             warn("rules: invalid method type defined for rule '.$rulename.'");
1922
           }
1923
1924
           # my $elapsed_time = tv_interval ($start_time, [gettimeofday]);
1925
           # dbg("rules: '.$rulename.' took $elapsed_time seconds, using method=[$tmethod] args=[$args]");
1926
1822
    }
1927
    }
1823
    ';
1928
    ';
1824
  }
1929
  }
Lines 2312-2329 Link Here
2312
2417
2313
    $evalstr2 .= '
2418
    $evalstr2 .= '
2314
    sub '.$rulename.'_rawbody_test {
2419
    sub '.$rulename.'_rawbody_test {
2315
       my $self = shift;
2420
           my $self = shift;
2316
       foreach (@_) {
2421
2317
         '.$self->hash_line_for_rule($rulename).'
2422
           # currently using Time::HiRes to do performance testing on
2318
         if ('.$pat.') { 
2423
           # individual rules...
2319
            $self->got_pattern_hit(q{'.$rulename.'}, "RAW: ");
2424
           # my $start_time = [gettimeofday];
2320
            '. $self->hit_rule_plugin_code($rulename, "rawbody") . '
2425
2321
            # Ok, we hit, stop now.
2426
           my $tmethod = $self->{conf}->{tmethod}->{'.$rulename.'}->{type} || "line";
2322
	    last unless $self->{conf}->{tflags}->{q{'.$rulename.'}} =~ /\bmultiple\b/;
2427
           my $args = $self->{conf}->{tmethod}->{'.$rulename.'}->{args} || "";
2323
         }
2428
2324
       }
2429
           # tmethod: line and linerange
2325
    }
2430
           if ($tmethod =~ m/^line/) {
2326
    ';
2431
2432
              my $linecount=1;
2433
              my $maxlines = '.$self->{conf}->{tmethod_max_lines_raw}.';
2434
2435
              foreach (@_) {
2436
2437
                # support predefined line number scanning
2438
                if ($tmethod eq "line" && $args =~ m/^\d+$/) {
2439
                   next unless ($linecount == $args);
2440
                }
2441
2442
                # support line range scanning
2443
                # do not allow the start/stop delta to exceed the
2444
                # config option tmethod_max_lines
2445
                elsif ($tmethod eq "linerange" && $args =~ m/(\-?\d+):(\-?\d+)/) {
2446
                   my ($start,$stop);
2447
                   if (defined $1) { $start = $1; } else { $start = 0; }
2448
                   if (defined $2) { $stop = $2; } else { $stop = scalar @_; }
2449
                   if ($stop - $start > $maxlines) {
2450
                     $stop = $start + $maxlines;
2451
                   }
2452
                   next unless ($linecount >= $start && $linecount < $stop);
2453
                }
2454
2455
                $linecount++;
2456
                '.$self->hash_line_for_rule($rulename).'
2457
                if ('.$pat.') {
2458
                   $self->got_pattern_hit(q{'.$rulename.'}, "RAW: ");
2459
                   '. $self->hit_rule_plugin_code($rulename, "rawbody") . '
2460
                   # Ok, we hit, stop now.
2461
                   last unless $self->{conf}->{tflags}->{q{'.$rulename.'}} =~ /\bmultiple\b/;
2462
                }
2463
              }
2464
           }
2465
2466
           elsif ($tmethod =~ m/^string/) {
2467
2468
              my ($fastbody,$start_pos,$bytes);
2469
              my $maxbytes = '.$self->{conf}->{tmethod_max_bytes_raw}.';
2470
2471
              # if args are passed to method string, we will set start
2472
              # position and number of bytes which will feed into
2473
              # a substr call later.  if start and stop are not defined
2474
              # we manually set start=0 and stop=tmethod_max_bytes
2475
              if ($args && $args =~ m/(\-?\d+):(\-?\d+)/) {
2476
                   if (defined $1) { $start_pos = $1; } else { $start_pos = 0; }
2477
                   if (defined $2) { $bytes = $2; } else { $bytes = $maxbytes }
2478
              }
2479
              else {
2480
                   $start_pos = 0;
2481
                   $bytes = $maxbytes;
2482
              }
2483
2484
              # append all content lines to fastbody scalar
2485
              # until the size of the scalar exceeds tmethod_max_bytes
2486
              foreach (@_) {
2487
                $fastbody .= $_;
2488
                if (length $fastbody > $maxbytes) {
2489
                    last;
2490
                }
2491
              }
2492
2493
              # substr the content down based on start pos and # of bytes
2494
              $fastbody = substr($fastbody,$start_pos,$bytes);
2495
2496
              # if the length of fastbody exceeds the config method_block_max_bytes
2497
              # we need to substr it down further to avoid expensive regexp tests
2498
              # ie, string <startpos> 512000 could be expensive depending on startpos
2499
              my $fblen = length $fastbody;
2500
              if ($fblen > $maxbytes) {
2501
                 $fastbody = substr($fastbody,0,$maxbytes);
2502
              }
2503
2504
              # if the tmethod is stringtrim, we need to convert
2505
              # newlines to space, and then convert excess whitespace
2506
              # to a single space.  this is most beneficial in rawbody
2507
              # ruletypes as there is no efficient way currently to get
2508
              #  html into a single trimmed string.
2509
              if ($tmethod eq "stringtrim") {
2510
                $fastbody =~ s/[\n\r]+/ /gs;
2511
                $fastbody =~ s/\s+/ /g;
2512
              }
2513
2514
              '.$self->hash_line_for_rule($rulename).'
2515
              if ($fastbody && $fastbody =~ '.$pat.') {
2516
                 $self->got_pattern_hit(q{'.$rulename.'}, "RAW: ");
2517
                 '. $self->hit_rule_plugin_code($rulename, "rawbody") . '
2518
              }
2519
              #dbg("rules: '.$rulename.' - start position $start_pos, bytes $bytes, total length $fblen");
2520
           }
2521
           else {
2522
             warn("rules: invalid method type defined for rule '.$rulename.'");
2523
           }
2524
2525
           # my $elapsed_time = tv_interval ($start_time, [gettimeofday]);
2526
           # dbg("rules: '.$rulename.' took $elapsed_time seconds, using method=[$tmethod] args=[$args]");
2527
        }
2528
        ';
2327
  }
2529
  }
2328
2530
2329
  # clear out a previous version of this fn, if already defined
2531
  # clear out a previous version of this fn, if already defined
(-)Conf.pm (+222 lines)
Lines 1581-1586 Link Here
1581
    code => \&Mail::SpamAssassin::Conf::Parser::set_template_clear
1581
    code => \&Mail::SpamAssassin::Conf::Parser::set_template_clear
1582
  });
1582
  });
1583
1583
1584
=item tmethod_max_lines_raw             (Default: 256)
1585
1586
This setting sets the maximum number of lines that will be tested
1587
for a C<rawbody> rule when C<tmethod> is set to C<linerange>.  This
1588
prevents a rule with a large start and stop delta
1589
(e.g. C<tmethod> "linerange 1:1000") from becoming very inefficient
1590
on large messages.  If you want to scan every line in a message,
1591
simply omit the tmethod from the rule completely.  C<tmethod> C<linerange>
1592
should be used to improve the efficiency of a rule when you know
1593
the location of the data.  By default, the maximum number of raw content
1594
lines scanned is 256.
1595
1596
=cut
1597
1598
  push (@cmds, {
1599
    setting => 'tmethod_max_lines_raw',
1600
    default => 256,
1601
    type => $CONF_TYPE_NUMERIC
1602
  });
1603
1604
=item tmethod_max_lines			(Default: 16)
1605
1606
This setting sets the maximum number of lines that will be tested
1607
for a C<body> rule when C<tmethod> is set to C<linerange>.  This
1608
prevents a body rule with a large start and stop delta
1609
(e.g. C<tmethod> "linerange 1:1000") from becoming very inefficient
1610
on large messages.  If you want to scan every line in a message,
1611
simply omit the tmethod from the rule completely.  C<tmethod> C<linerange>
1612
should be used to improve the efficiency of a rule when you know
1613
the location of the data.  By default, the maximum number of body content
1614
lines scanned is 16.  Note that body rules by default have
1615
most of the whitespace and newlines trimmed already, so even if the
1616
message is hundreds of lines long, it may be trimmed down to a dozen
1617
lines or fewer due to the nature of the message parser.
1618
1619
=cut
1620
1621
  push (@cmds, {
1622
    setting => 'tmethod_max_lines',
1623
    default => 16,
1624
    type => $CONF_TYPE_NUMERIC
1625
  });
1626
1627
=item tmethod_max_bytes_raw		(Default: 65536)
1628
1629
This setting sets the maximum number of bytes that a regexp can be
1630
tested against for a C<rawbody> rule when C<tmethod> is set to
1631
C<string> or C<stringtrim>.  This prevents a rule with a large
1632
byte range (e.g. C<tmethod> "string 0:512000") from
1633
becoming inefficient on large messages.  C<tmethod> C<string>
1634
should only be used to improve rule efficiency when you know the
1635
general location of the data you are testing.  By default, the maximum
1636
amount of usable raw data is 64kb (65536 bytes).
1637
1638
=cut
1639
  
1640
  push (@cmds, {
1641
    setting => 'tmethod_max_bytes_raw',
1642
    default => 65536,
1643
    type => $CONF_TYPE_NUMERIC
1644
  });
1645
1646
=item tmethod_max_bytes			(Default: 16384)
1647
1648
This setting sets the maximum number of bytes that a regexp can be
1649
tested against for a C<body> rule when C<tmethod> is set to
1650
C<string> or C<stringtrim>.  This prevents a rule with a large
1651
value for bytes (e.g. C<tmethod> "string 0:512000") from becoming
1652
inefficient on large messages.  C<tmethod> C<string> should only be
1653
used to improve rule efficiency when you know the general location
1654
of the data you are testing.  By default, the maximum amount of usable
1655
body content is 16kb (16384 bytes).
1656
1657
=cut
1658
1659
  push (@cmds, {
1660
    setting => 'tmethod_max_bytes',
1661
    default => 16384,
1662
    type => $CONF_TYPE_NUMERIC
1663
  });
1664
1584
=back
1665
=back
1585
1666
1586
=head1 RULE DEFINITIONS AND PRIVILEGED SETTINGS
1667
=head1 RULE DEFINITIONS AND PRIVILEGED SETTINGS
Lines 1933-1938 Link Here
1933
    }
2014
    }
1934
  });
2015
  });
1935
2016
2017
=item fast_body SYMBOLIC_TEST_NAME /pattern/modifiers
2018
2019
Same as C<body> except it predefines a C<stringtrim> C<tmethod> of C<0:8192>,
2020
which means any fast_body rule will only be applied to the first 8kb
2021
of body content at most.  If your rule needs access to more body content than
2022
8kb, use a C<body> rule and define its C<tmethod> separately.
2023
2024
C<fast_body> evals are not supported.
2025
2026
=cut
2027
2028
  push (@cmds, {
2029
    setting => 'fast_body',
2030
    is_frequent => 1,
2031
    is_priv => 1,
2032
    code => sub {
2033
      my ($self, $key, $value, $line) = @_;
2034
      my @values = split(/\s+/, $value, 2);
2035
      if (@values != 2) {
2036
        return $MISSING_REQUIRED_VALUE;
2037
      }
2038
      $self->{parser}->add_test (@values, $TYPE_BODY_TESTS);
2039
      $self->{tmethod}->{$values[0]} = {
2040
          type => 'stringtrim', args => '0:8192'
2041
      };
2042
    }
2043
  });
2044
1936
=item uri SYMBOLIC_TEST_NAME /pattern/modifiers
2045
=item uri SYMBOLIC_TEST_NAME /pattern/modifiers
1937
2046
1938
Define a uri pattern test.  C<pattern> is a Perl regular expression.  Note: as
2047
Define a uri pattern test.  C<pattern> is a Perl regular expression.  Note: as
Lines 2000-2005 Link Here
2000
    }
2109
    }
2001
  });
2110
  });
2002
2111
2112
=item fast_rawbody SYMBOLIC_TEST_NAME /pattern/modifiers
2113
2114
Same as C<rawbody> except it predefines a C<tmethod> C<stringtrim>
2115
of C<0:32768>, which means any fast_rawbody rule will only be applied to the
2116
first 32kb of raw-body content at most.  If your rule needs access
2117
to more raw-body content than 32kb, use a C<rawbody> rule and
2118
define its C<tmethod> separately.
2119
2120
C<fast_rawbody> evals are not supported.
2121
2122
=cut
2123
2124
  push (@cmds, {
2125
    setting => 'fast_rawbody',
2126
    is_frequent => 1,
2127
    is_priv => 1,
2128
    code => sub {
2129
      my ($self, $key, $value, $line) = @_;
2130
      my @values = split(/\s+/, $value, 2);
2131
      if (@values != 2) {
2132
        return $MISSING_REQUIRED_VALUE;
2133
      }
2134
      $self->{parser}->add_test (@values, $TYPE_RAWBODY_TESTS);
2135
      $self->{tmethod}->{$values[0]} = {
2136
          type => 'stringtrim', args => '0:32768'
2137
      };
2138
    }
2139
  });
2140
2003
=item full SYMBOLIC_TEST_NAME /pattern/modifiers
2141
=item full SYMBOLIC_TEST_NAME /pattern/modifiers
2004
2142
2005
Define a full message pattern test.  C<pattern> is a Perl regular expression.
2143
Define a full message pattern test.  C<pattern> is a Perl regular expression.
Lines 2131-2136 Link Here
2131
    type => $CONF_TYPE_HASH_KEY_VALUE
2269
    type => $CONF_TYPE_HASH_KEY_VALUE
2132
  });
2270
  });
2133
2271
2272
=item tmethod SYMBOLIC_TEST_NAME [ {line|linerange|string|stringtrim} ] [args]
2273
2274
Used to alter how a test is performed.  The default method applied
2275
to tests is C<line> with no args, which applies the regexp to each
2276
line of content.  tmethod currently only supports C<body> and C<rawbody>
2277
rule types.  Documentation for each tmethod is below:
2278
2279
=over 4
2280
2281
=item line [line num]
2282
2283
the C<line> tmethod is the default method applied on all rules unless
2284
overridden with a different method.  if C<tmethod> C<line> contains a
2285
C<line num> parameter, only that line number will be used to test
2286
the regexp.
2287
2288
C<line> can be used with C<body> and C<rawbody> rule types.
2289
2290
=item linerange [start:stop]
2291
2292
the C<linerange> method allows you to apply a regexp against only
2293
a defined range of lines C<start:stop>.  0:5 would apply the regexp
2294
to the first 5 lines of content.  -5:0 would apply the regexp to 
2295
the last 5 lines of content.  
2296
2297
C<linerange> can be used with C<body> and C<rawbody> rule types.
2298
2299
=item string  [start_pos:bytes]
2300
2301
The C<string> method allows you to apply a regexp against only
2302
a predefined byte range by first setting a C<start_pos>, and then
2303
defining the number of C<bytes> to test from that position.  The
2304
C<start_pos> may be negative, and in those instances the position
2305
will be set from the end of the content.  To scan the last 2kb of
2306
content, you could define the parameter as C<-2048:2048>.
2307
To scan the first 512 bytes of content, C<0:512> would be used.
2308
2309
C<string> can be used with C<body> and C<rawbody> rule types.
2310
2311
=item stringtrim [start_pos:bytes]
2312
2313
The C<stringtrim> method works identically to the C<string> method
2314
discussed above, except that it converts newlines to spaces and collapses
2315
runs of whitespace to a single space before testing.
2316
2317
C<stringtrim> can be used with C<body> and C<rawbody> rule types.
2318
2319
=back
2320
2321
=cut
2322
2323
  push (@cmds, {
2324
    setting => 'tmethod',
2325
    is_frequent => 1,
2326
    is_priv => 1,
2327
    code => sub {
2328
      my ($self, $key, $value, $line) = @_;
2329
2330
      if ($value =~ /^(\S+)\s+(\S+)\s+(.+)$/) {
2331
        my $name = $1;
2332
        my $type = $2;
2333
        my $args = $3;
2334
        dbg("rules: override tmethod for rule $name, type $type, args $args");
2335
        $self->{tmethod}->{$name} = {
2336
          type => $type, args => $args
2337
        };
2338
      }
2339
      elsif ($value =~ /^(\S+)\s+(\S+)$/) {
2340
        my $name = $1;
2341
        my $type = $2;
2342
        dbg("rules: override tmethod for rule $name, type $type");
2343
        $self->{tmethod}->{$name} = {
2344
          type => $type, args => undef
2345
        };
2346
      }
2347
      elsif ($value =~ /^$/) {
2348
        return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
2349
      }
2350
      else {
2351
        return $Mail::SpamAssassin::Conf::INVALID_VALUE;
2352
      }
2353
    }
2354
  });
2355
2134
=item priority SYMBOLIC_TEST_NAME n
2356
=item priority SYMBOLIC_TEST_NAME n
2135
2357
2136
Assign a specific priority to a test.  All tests, except for DNS and Meta
2358
Assign a specific priority to a test.  All tests, except for DNS and Meta

Return to bug 4691