Lines 1580-1586
Link Here
|
1580 |
# strip out the (comments) |
1580 |
# strip out the (comments) |
1581 |
$result =~ s/\s*\(.*?\)//g; |
1581 |
$result =~ s/\s*\(.*?\)//g; |
1582 |
# strip out the "quoted text" |
1582 |
# strip out the "quoted text" |
1583 |
$result =~ s/(?<!<)"[^"]*"(?!@)//g; |
1583 |
$result =~ s/(?<!<)"[^"]*"(?!@)//g; #" emacs |
1584 |
# Foo Blah <jm@xxx> or <jm@xxx> |
1584 |
# Foo Blah <jm@xxx> or <jm@xxx> |
1585 |
$result =~ s/^[^<]*?<(.*?)>.*$/$1/; |
1585 |
$result =~ s/^[^<]*?<(.*?)>.*$/$1/; |
1586 |
# multiple addresses on one line? remove all but first |
1586 |
# multiple addresses on one line? remove all but first |
Lines 1625-1640
Link Here
|
1625 |
|
1625 |
|
1626 |
########################################################################### |
1626 |
########################################################################### |
1627 |
|
1627 |
|
1628 |
# Taken from URI and URI::Find |
1628 |
# uri parsing from plain text: |
1629 |
my $reserved = q(;/?:@&=+$,[]\#|); |
1629 |
# The goals are to find URIs in plain text spam that are intended to be clicked on or copy/pasted, but |
1630 |
my $mark = q(-_.!~*'()); #'; emacs |
1630 |
# ignore random strings that might look like URIs, for example in uuencoded files, and to ignore |
1631 |
my $unreserved = "A-Za-z0-9\Q$mark\E\x00-\x08\x0b\x0c\x0e-\x1a\x1c-\x1f"; |
1631 |
# URIs that spammers might seed in spam in ways not visible or clickable to add work to spam filters. |
1632 |
my $uricSet = quotemeta($reserved) . $unreserved . "%"; |
1632 |
# When we extract a domain and look it up in an RBL, an FP on decding that the text is a URI is not much |
|
|
1633 |
# of a problem, as the only cost is an extra RBL lookup. The same FP is worse if the URI is used in matching rule |
1634 |
# because it could lead to a rule FP, as in bug 5780 with WIERD_PORT matching random uuencoded strings. |
1635 |
# The principles of the following code are 1) if ThunderBird or Outlook Express would linkify a string, |
1636 |
# then we should attempt to parse it as a URI; 2) Where TBird and OE parse differently, choose to do what is most |
1637 |
# likely to find a domain for the RBL tests; 3) If it begins with a scheme or www\d*\. or ftp\. assume that |
1638 |
# it is a URI; 4) If it does not then require that the start of the string looks like a FQDN with a valid TLD; |
1639 |
# 5) Reject strings that after parsing, URLDecoding, and redirection processing don't have a valid TLD |
1640 |
# |
1641 |
# We get the entire URI that would be linkified before dealing with it, in order to do the right thing |
1642 |
# with URI-encodings and redirecting URIs. |
1643 |
# |
1644 |
# The delimiters for start of a URI in TBird are @(`{|[\"'<>,\s in OE they are ("<\s |
1645 |
# |
1646 |
# Tbird allows .,?';-! in a URI but ignores [.,?';-!]* at the end. |
1647 |
# TBird's end delimiters are )`{}|[]"<>\s but ) is only an end delmiter if there is no ( in the URI |
1648 |
# OE only uses space as a delimiter, but ignores [~!@#^&*()_+`-={}|[]:";'<>?,.]* at the end. |
1649 |
# |
1650 |
# Both TBird and OE decide that a URI is an email address when there is '@' character embedded in it. |
1651 |
# TBird has some additional restrictions on email URIs: They cannot contain non-ASCII characters and their end |
1652 |
# delimiters include ( and ' |
1653 |
# |
1654 |
# bug 4522: ISO2022 format mail, most commonly Japanese SHIFT-JIS, inserts a three character escape sequence ESC ( . |
1633 |
|
1655 |
|
1634 |
my $schemeRE = qr/(?:https?|ftp|mailto|javascript|file)/i; |
1656 |
# a hybrid of tbird and oe's version of uri parsing |
|
|
1657 |
my $tbirdstartdelim = '><"\'`,{[(|\s' . "\x1b"; # The \x1b as per bug 4522 |
1658 |
my $iso2022shift = "\x1b" . '\(.'; # bug 4522 |
1659 |
my $tbirdenddelim = '><"`}\]{[|\s' . "\x1b"; # The \x1b as per bug 4522 |
1660 |
my $oeignoreatend = '-~!@#^&*()_+=:;\'?,.'; |
1661 |
my $nonASCII = '\x80-\xff'; |
1662 |
my $tbirdenddelimemail = $tbirdenddelim . '(\'' . $nonASCII; # tbird ignores non-ASCII mail addresses for now, until RFC changes |
1663 |
my $tbirdenddelimplusat = $tbirdenddelimemail . '@'; |
1635 |
|
1664 |
|
1636 |
my $uricCheat = $uricSet; |
1665 |
# regexps for finding plain text non-scheme hostnames with valid TLDs. |
1637 |
$uricCheat =~ tr/://d; |
|
|
1638 |
|
1666 |
|
1639 |
# the list from %VALID_TLDS in Util/RegistrarBoundaries.pm, as a |
1667 |
# the list from %VALID_TLDS in Util/RegistrarBoundaries.pm, as a |
1640 |
# Regexp::Optimize optimized regexp ;) accurate as of 20050318 |
1668 |
# Regexp::Optimize optimized regexp ;) accurate as of 20050318 |
Lines 1649-1706
Link Here
|
1649 |
|t[cdfghjklmnoprtvwz]|u[agkmsyz]|v[aceginu]|w[fs]|xxx|y[etu]|z[amw]|ed?u|qa |
1677 |
|t[cdfghjklmnoprtvwz]|u[agkmsyz]|v[aceginu]|w[fs]|xxx|y[etu]|z[amw]|ed?u|qa |
1650 |
)/ix; |
1678 |
)/ix; |
1651 |
|
1679 |
|
1652 |
# from RFC 1035, but allowing domains starting with numbers: |
1680 |
# knownscheme regexp looks for either a https?: or ftp: scheme, or www\d*\. or ftp\. prefix, i.e., likely to start a URL |
1653 |
# $label = q/[A-Za-z\d](?:[A-Za-z\d-]{0,61}[A-Za-z\d])?/; |
1681 |
# schemeless regexp looks for a valid TLD at the end of what may be a FQDN, followed by optional ., optional :portnum, optional /rest_of_uri |
1654 |
# $domain = qq<$label(?:\.$label)*>; |
1682 |
my $urischemeless = qr/[a-z\d][a-z\d._-]{0,251}\.${tldsRE}\.?(?::\d{1,5})?(?:\/[^$tbirdenddelim]{1,251})?/io; |
1655 |
# length($host) <= 255 && $host =~ /^($domain)$/ |
1683 |
my $uriknownscheme = qr/(?:(?:(?:(?:https?)|(?:ftp)):(?:\/\/)?)|(?:(?:www\d{0,2}|ftp)\.))[^$tbirdenddelim]{1,251}/io; |
1656 |
# changes: |
1684 |
my $urimailscheme = qr/(?:mailto:)?[^$tbirdenddelimplusat]{1,251}@[^$tbirdenddelimemail]{1,251}/io; |
1657 |
# massively simplified from grammar, only matches known TLDs, a single |
1685 |
my $tbirdurire = qr/(?:\b|(?<=$iso2022shift)|(?<=[$tbirdstartdelim])) |
1658 |
# dot at end of TLD works |
1686 |
(?:(?:($uriknownscheme)(?=[$tbirdenddelim])) | |
1659 |
# negative look-behinds: |
1687 |
(?:($urimailscheme)(?=[$tbirdenddelimemail])) | |
1660 |
# (?<![a-z\d][.-]) = don't let there be more hostname behind, but |
1688 |
(?:\b($urischemeless)(?=[$tbirdenddelim])))/xo; |
1661 |
# don't miss ".....www.bar.com" or "-----www.foo.com" |
|
|
1662 |
# (?<!.\@) = this will be caught by the email address regular expression |
1663 |
my $schemelessRE = qr/(?<![a-z\d][._-])(?<!.\@)\b[a-z\d] |
1664 |
[a-z\d._-]{0,251} |
1665 |
\.${tldsRE}\.?\b |
1666 |
(?![a-z\d._-]) |
1667 |
/ix; |
1668 |
|
1689 |
|
1669 |
my $uriRe = qr/\b(?:$schemeRE:[$uricCheat]|$schemelessRE)[$uricSet#]*/o; |
|
|
1670 |
|
1671 |
# Taken from Email::Find (thanks Tatso!) |
1672 |
# This is the BNF from RFC 822 |
1673 |
my $esc = '\\\\'; |
1674 |
my $period = '\.'; |
1675 |
my $space = '\040'; |
1676 |
my $open_br = '\['; |
1677 |
my $close_br = '\]'; |
1678 |
my $nonASCII = '\x80-\xff'; |
1679 |
my $ctrl = '\000-\037'; |
1680 |
my $cr_list = '\n\015'; |
1681 |
my $qtext = qq/[^$esc$nonASCII$cr_list\"]/; #" |
1682 |
my $dtext = qq/[^$esc$nonASCII$cr_list$open_br$close_br]/; |
1683 |
my $quoted_pair = qq<$esc>.qq<[^$nonASCII]>; |
1684 |
my $atom_char = qq/[^($space)<>\@,;:\".$esc$open_br$close_br$ctrl$nonASCII]/; |
1685 |
#" |
1686 |
my $atom = qq{(?>$atom_char+)}; |
1687 |
my $quoted_str = qq<\"$qtext*(?:$quoted_pair$qtext*)*\">; #" |
1688 |
my $word = qq<(?:$atom|$quoted_str)>; |
1689 |
my $local_part = qq<$word(?:$period$word)*>; |
1690 |
|
1691 |
# This is a combination of the domain name BNF from RFC 1035 plus the |
1692 |
# domain literal definition from RFC 822, but allowing domains starting |
1693 |
# with numbers. |
1694 |
my $label = q/[A-Za-z\d](?:[A-Za-z\d-]*[A-Za-z\d])?/; |
1695 |
my $domain_ref = qq<$label(?:$period$label)*>; |
1696 |
my $domain_lit = qq<$open_br(?:$dtext|$quoted_pair)*$close_br>; |
1697 |
my $domain = qq<(?:$domain_ref|$domain_lit)>; |
1698 |
|
1699 |
# Finally, the address-spec regex (more or less) |
1700 |
my $Addr_spec_re = qr<$local_part\s*\@\s*$domain>o; |
1701 |
|
1702 |
# TVD: This really belongs in metadata |
1703 |
|
1704 |
=item $status->get_uri_list () |
1690 |
=item $status->get_uri_list () |
1705 |
|
1691 |
|
1706 |
Returns an array of all unique URIs found in the message. It takes |
1692 |
Returns an array of all unique URIs found in the message. It takes |
Lines 1895-1900
Link Here
|
1895 |
# also, if we allow $textary to be passed in, we need to invalidate |
1881 |
# also, if we allow $textary to be passed in, we need to invalidate |
1896 |
# the cache first. fyi. |
1882 |
# the cache first. fyi. |
1897 |
my $textary = $self->get_decoded_stripped_body_text_array(); |
1883 |
my $textary = $self->get_decoded_stripped_body_text_array(); |
|
|
1884 |
my $redirector_patterns = $self->{conf}->{redirector_patterns}; |
1898 |
|
1885 |
|
1899 |
my ($rulename, $pat, @uris); |
1886 |
my ($rulename, $pat, @uris); |
1900 |
local ($_); |
1887 |
local ($_); |
Lines 1903-1952
Link Here
|
1903 |
|
1890 |
|
1904 |
for (@$textary) { |
1891 |
for (@$textary) { |
1905 |
# NOTE: do not modify $_ in this loop |
1892 |
# NOTE: do not modify $_ in this loop |
1906 |
while (/($uriRe)/igo) { |
1893 |
while (/$tbirdurire/igo) { |
1907 |
my $uri = $1; |
1894 |
my $rawuri = $1||$2||$3; |
|
|
1895 |
$rawuri =~ s/(^[^(]*)\).*$/$1/; # as per ThunderBird, ) is an end delimiter if there is no ( preceeding it |
1896 |
$rawuri =~ s/[$oeignoreatend]*$//; # remove trailing string of punctuations that TBird ignores |
1897 |
# skip if there is '..' in the hostname portion of the URI, something we can't catch in the general URI regexp |
1898 |
next if $rawuri =~ /^(?:(?:https?|ftp|mailto):(?:\/\/)?)?[a-z\d.-]*\.\./i; |
1908 |
|
1899 |
|
1909 |
# skip mismatches from URI regular expression |
1900 |
# If it's a hostname that was just sitting out in the |
1910 |
next if $uri =~ /^[a-z\d.-]*\.\./i; # skip ".." |
1901 |
# open, without a protocol, and not inside of an HTML tag, |
1911 |
|
1902 |
# the we should add the proper protocol in front, rather |
1912 |
$uri =~ s/^<(.*)>$/$1/; |
1903 |
# than using the base URI. |
1913 |
$uri =~ s/[\]\)>#]$//; |
1904 |
my $uri = $rawuri; |
1914 |
|
1905 |
my $rblonly; |
1915 |
if ($uri !~ /^${schemeRE}:/io) { |
1906 |
if ($uri !~ /^(?:https?|ftp|mailto|javascript|file):/i) { |
1916 |
# If it's a hostname that was just sitting out in the |
|
|
1917 |
# open, without a protocol, and not inside of an HTML tag, |
1918 |
# the we should add the proper protocol in front, rather |
1919 |
# than using the base URI. |
1920 |
if ($uri =~ /^ftp\./i) { |
1907 |
if ($uri =~ /^ftp\./i) { |
1921 |
push (@uris, $uri); |
|
|
1922 |
$uri = "ftp://$uri"; |
1908 |
$uri = "ftp://$uri"; |
1923 |
} |
1909 |
} |
1924 |
if ($uri =~ /\@/) { |
1910 |
elsif ($uri =~ /^www\d{0,2}\./i) { |
1925 |
push (@uris, $uri); |
1911 |
$uri = "http://$uri"; |
|
|
1912 |
} |
1913 |
elsif ($uri =~ /\@/) { |
1926 |
$uri = "mailto:$uri"; |
1914 |
$uri = "mailto:$uri"; |
1927 |
} |
1915 |
} |
1928 |
else # if ($uri =~ /^www\d*\./i) |
1916 |
else { |
1929 |
{ |
|
|
1930 |
# some spammers are using unschemed URIs to escape filters |
1917 |
# some spammers are using unschemed URIs to escape filters |
1931 |
push (@uris, $uri); |
1918 |
$rblonly = 1; # flag that this is a URI that MUAs don't linkify so only use for RBLs |
1932 |
$uri = "http://$uri"; |
1919 |
$uri = "http://$uri"; |
1933 |
} |
1920 |
} |
1934 |
} |
1921 |
} |
1935 |
|
1922 |
|
1936 |
# warn("uri: got URI: $uri\n"); |
1923 |
if ($uri =~ /^mailto:/) { |
1937 |
push @uris, $uri; |
1924 |
# skip a mail link that does not have a valid TLD or other than one @ after decoding any URLEncoded characters |
1938 |
} |
1925 |
$uri = Mail::SpamAssassin::Util::url_encode($uri) if ($uri =~ /\%(?:2[1-9a-fA-F]|[3-6][0-9a-fA-f]|7[0-9a-eA-E])/); |
1939 |
while (/($Addr_spec_re)/igo) { |
1926 |
next if ($uri !~ /^[^@]+@[^@]+$/); |
1940 |
my $uri = $1; |
1927 |
my $domuri = Mail::SpamAssassin::Util::uri_to_domain($uri); |
|
|
1928 |
next unless $domuri; |
1929 |
push (@uris, $rawuri); |
1930 |
push (@uris, $uri) unless ($rawuri eq $uri); |
1931 |
} |
1941 |
|
1932 |
|
1942 |
# skip mismatches from email address regular expression |
1933 |
next unless ($uri =~/^(?:https?|ftp):/); # at this point only valid if one or the other of these |
1943 |
next unless $uri =~ /\.${tldsRE}\W*$/io; # skip non-TLDs |
|
|
1944 |
|
1934 |
|
1945 |
$uri =~ s/\s*\@\s*/@/; # remove spaces around the '@' |
1935 |
my @tmp = Mail::SpamAssassin::Util::uri_list_canonify($redirector_patterns, $uri); |
1946 |
$uri = "mailto:$uri"; # prepend mailto: |
1936 |
my $goodurifound = 0; |
1947 |
|
1937 |
foreach my $cleanuri (@tmp) { |
1948 |
#warn("uri: got URI: $uri\n"); |
1938 |
my $domain = Mail::SpamAssassin::Util::uri_to_domain($cleanuri); |
1949 |
push @uris, $uri; |
1939 |
if ($domain) { |
|
|
1940 |
# bug 5780: Stop after domain to avoid FP, but do that after all deobfuscation of urlencoding and redirection |
1941 |
$cleanuri =~ s/^(https?:\/\/[^:\/]+).*$/$1/ if $rblonly; |
1942 |
push (@uris, $cleanuri); |
1943 |
$goodurifound = 1; |
1944 |
} |
1945 |
} |
1946 |
next unless $goodurifound; |
1947 |
push @uris, $rawuri unless $rblonly; |
1950 |
} |
1948 |
} |
1951 |
} |
1949 |
} |
1952 |
|
1950 |
|