Attachment #4248 for bug #5780

View | Details | Raw Unified | Return to bug 5780
Collapse All | Expand All




      # strip out the (comments)
      $result =~ s/\s*\(.*?\)//g;
      # strip out the "quoted text"
      $result =~ s/(?<!<)"[^"]*"(?!@)//g;   #" emacs
      # Foo Blah <jm@xxx> or <jm@xxx>
      $result =~ s/^[^<]*?<(.*?)>.*$/$1/;
      # multiple addresses on one line? remove all but first


###########################################################################

# uri parsing from plain text:
# The goals are to find URIs in plain text spam that are intended to be clicked on or copy/pasted, but
# ignore random strings that might look like URIs, for example in uuencoded files, and to ignore
# URIs that spammers might seed in spam in ways not visible or clickable to add work to spam filters.
# When we extract a domain and look it up in an RBL, an FP on decding that the text is a URI is not much
# of a problem, as the only cost is an extra RBL lookup. The same FP is worse if the URI is used in matching rule
# because it could lead to a rule FP, as in bug 5780 with WIERD_PORT matching random uuencoded strings.
# The principles of the following code are 1) if ThunderBird or Outlook Express would linkify a string,
# then we should attempt to parse it as a URI; 2) Where TBird and OE parse differently, choose to do what is most
# likely to find a domain for the RBL tests; 3) If it begins with a scheme or www\d*\. or ftp\. assume that
# it is a URI; 4) If it does not then require that the start of the string looks like a FQDN with a valid TLD;
# 5) Reject strings that after parsing, URLDecoding, and redirection processing don't have a valid TLD
#
# We get the entire URI that would be linkified before dealing with it, in order to do the right thing
# with URI-encodings and redirecting URIs.
#
# The delimiters for start of a URI in TBird are @(`{|[\"'<>,\s   in OE they are ("<\s
#
# Tbird allows .,?';-! in a URI but ignores [.,?';-!]* at the end.
# TBird's end delimiters are )`{}|[]"<>\s but ) is only an end delmiter if there is no ( in the URI
# OE only uses space as a delimiter, but ignores [~!@#^&*()_+`-={}|[]:";'<>?,.]* at the end.
#
# Both TBird and OE decide that a URI is an email address when there is '@' character embedded in it.
# TBird has some additional restrictions on email URIs: They cannot contain non-ASCII characters and their end
# delimiters include ( and '
#
# bug 4522: ISO2022 format mail, most commonly Japanese SHIFT-JIS, inserts a three character escape sequence  ESC ( .

# a hybrid of tbird and oe's  version of uri parsing
my $tbirdstartdelim = '><"\'`,{[(|\s'  . "\x1b";  # The \x1b as per bug 4522
my $iso2022shift = "\x1b" . '\(.';  # bug 4522
my $tbirdenddelim = '><"`}\]{[|\s' . "\x1b";  # The \x1b as per bug 4522
my $oeignoreatend = '-~!@#^&*()_+=:;\'?,.';
my $nonASCII    = '\x80-\xff';
my $tbirdenddelimemail = $tbirdenddelim . '(\'' . $nonASCII;  # tbird ignores non-ASCII mail addresses for now, until RFC changes
my $tbirdenddelimplusat = $tbirdenddelimemail . '@';

# regexps for finding plain text non-scheme hostnames with valid TLDs.
$uricCheat =~ tr/://d;

# the list from %VALID_TLDS in Util/RegistrarBoundaries.pm, as a
# Regexp::Optimize optimized regexp ;)  accurate as of 20050318

      |t[cdfghjklmnoprtvwz]|u[agkmsyz]|v[aceginu]|w[fs]|xxx|y[etu]|z[amw]|ed?u|qa
    )/ix;

# knownscheme regexp looks for either a https?: or ftp: scheme, or www\d*\. or ftp\. prefix, i.e., likely to start a URL
# schemeless regexp looks for a valid TLD at the end of what may be a FQDN, followed by optional ., optional :portnum, optional /rest_of_uri
my $urischemeless = qr/[a-z\d][a-z\d._-]{0,251}\.${tldsRE}\.?(?::\d{1,5})?(?:\/[^$tbirdenddelim]{1,251})?/io;
my $uriknownscheme = qr/(?:(?:(?:(?:https?)|(?:ftp)):(?:\/\/)?)|(?:(?:www\d{0,2}|ftp)\.))[^$tbirdenddelim]{1,251}/io;
my $urimailscheme = qr/(?:mailto:)?[^$tbirdenddelimplusat]{1,251}@[^$tbirdenddelimemail]{1,251}/io;
my $tbirdurire = qr/(?:\b|(?<=$iso2022shift)|(?<=[$tbirdstartdelim]))
                    (?:(?:($uriknownscheme)(?=[$tbirdenddelim])) |
                       (?:($urimailscheme)(?=[$tbirdenddelimemail])) |
                       (?:\b($urischemeless)(?=[$tbirdenddelim])))/xo;
#                      don't miss ".....www.bar.com" or "-----www.foo.com"
#   (?<!.\@) = this will be caught by the email address regular expression
my $schemelessRE = qr/(?<![a-z\d][._-])(?<!.\@)\b[a-z\d]
                      [a-z\d._-]{0,251}
                      \.${tldsRE}\.?\b
                      (?![a-z\d._-])
                      /ix;

my $uriRe = qr/\b(?:$schemeRE:[$uricCheat]|$schemelessRE)[$uricSet#]*/o;

# Taken from Email::Find (thanks Tatso!)
# This is the BNF from RFC 822
my $esc         = '\\\\';
my $period      = '\.';
my $space       = '\040';
my $open_br     = '\[';
my $close_br    = '\]';
my $nonASCII    = '\x80-\xff';
my $ctrl        = '\000-\037';
my $cr_list     = '\n\015';
my $qtext       = qq/[^$esc$nonASCII$cr_list\"]/; #"
my $dtext       = qq/[^$esc$nonASCII$cr_list$open_br$close_br]/;
my $quoted_pair = qq<$esc>.qq<[^$nonASCII]>;
my $atom_char   = qq/[^($space)<>\@,;:\".$esc$open_br$close_br$ctrl$nonASCII]/;
#"
my $atom        = qq{(?>$atom_char+)};
my $quoted_str  = qq<\"$qtext*(?:$quoted_pair$qtext*)*\">; #"
my $word        = qq<(?:$atom|$quoted_str)>;
my $local_part  = qq<$word(?:$period$word)*>;

# This is a combination of the domain name BNF from RFC 1035 plus the
# domain literal definition from RFC 822, but allowing domains starting
# with numbers.
my $label       = q/[A-Za-z\d](?:[A-Za-z\d-]*[A-Za-z\d])?/;
my $domain_ref  = qq<$label(?:$period$label)*>;
my $domain_lit  = qq<$open_br(?:$dtext|$quoted_pair)*$close_br>;
my $domain      = qq<(?:$domain_ref|$domain_lit)>;

# Finally, the address-spec regex (more or less)
my $Addr_spec_re   = qr<$local_part\s*\@\s*$domain>o;

# TVD: This really belongs in metadata

=item $status->get_uri_list ()

Returns an array of all unique URIs found in the message.  It takes

    # also, if we allow $textary to be passed in, we need to invalidate
    # the cache first. fyi.
    my $textary = $self->get_decoded_stripped_body_text_array();
    my $redirector_patterns = $self->{conf}->{redirector_patterns};

    my ($rulename, $pat, @uris);
    local ($_);


    for (@$textary) {
      # NOTE: do not modify $_ in this loop
      while (/$tbirdurire/igo) {
        my $rawuri = $1||$2||$3;
        $rawuri =~ s/(^[^(]*)\).*$/$1/;  # as per ThunderBird, ) is an end delimiter if there is no ( preceeding it
        $rawuri =~ s/[$oeignoreatend]*$//; # remove trailing string of punctuations that TBird ignores
        # skip if there is '..' in the hostname portion of the URI, something we can't catch in the general URI regexp
        next if $rawuri =~ /^(?:(?:https?|ftp|mailto):(?:\/\/)?)?[a-z\d.-]*\.\./i;

        # If it's a hostname that was just sitting out in the
        # open, without a protocol, and not inside of an HTML tag,
        # the we should add the proper protocol in front, rather
        # than using the base URI.
        my $uri = $rawuri;
        my $rblonly;
        if ($uri !~ /^(?:https?|ftp|mailto|javascript|file):/i) {
          # If it's a hostname that was just sitting out in the
          # open, without a protocol, and not inside of an HTML tag,
          # the we should add the proper protocol in front, rather
          # than using the base URI.
          if ($uri =~ /^ftp\./i) {
            push (@uris, $uri);
            $uri = "ftp://$uri";
          }
          elsif ($uri =~ /^www\d{0,2}\./i) {
            $uri = "http://$uri";
          }
          elsif ($uri =~ /\@/) {
            $uri = "mailto:$uri";
          }
          else {
          {
            # some spammers are using unschemed URIs to escape filters
            $rblonly = 1;    # flag that this is a URI that MUAs don't linkify so only use for RBLs
            $uri = "http://$uri";
          }
        }

        if ($uri =~ /^mailto:/) {
          # skip a mail link that does not have a valid TLD or other than one @ after decoding any URLEncoded characters
          $uri = Mail::SpamAssassin::Util::url_encode($uri) if ($uri =~ /\%(?:2[1-9a-fA-F]|[3-6][0-9a-fA-f]|7[0-9a-eA-E])/);
          next if ($uri !~ /^[^@]+@[^@]+$/);
          my $domuri = Mail::SpamAssassin::Util::uri_to_domain($uri);
          next unless $domuri;
          push (@uris, $rawuri);
          push (@uris, $uri) unless ($rawuri eq $uri);
        }

        next unless ($uri =~/^(?:https?|ftp):/);  # at this point only valid if one or the other of these
        next unless $uri =~ /\.${tldsRE}\W*$/io;	# skip non-TLDs

        my @tmp = Mail::SpamAssassin::Util::uri_list_canonify($redirector_patterns, $uri);
        my $goodurifound = 0;
        foreach my $cleanuri (@tmp) {
          my $domain = Mail::SpamAssassin::Util::uri_to_domain($cleanuri);
          if ($domain) {
            # bug 5780: Stop after domain to avoid FP, but do that after all deobfuscation of urlencoding and redirection
            $cleanuri =~ s/^(https?:\/\/[^:\/]+).*$/$1/ if $rblonly;
            push (@uris, $cleanuri);
            $goodurifound = 1;
          }
        }
        next unless $goodurifound;
        push @uris, $rawuri unless $rblonly;
      }
    }





ok ($urimap{'http://66.92.69.222/'});
ok ($urimap{'http://66.92.69.223/'});
ok ($urimap{'http://66.92.69.224/'});
ok ($urimap{'http://spamassassin.org'});
ok (!$urimap{'CUMSLUTS.'});
ok (!$urimap{'CUMSLUTS..VIRGIN'});



  # undef is valid in some situations, so deal with it...
  if (!defined $expect) {
    warn("try_domains: failed! expect: undefined got: '$result'\n") if (defined $result);
    return !defined $result;
  }
  elsif (!defined $result) {
    warn "try_domains: failed! expect: '$expect' got: undefined\n";
    return 0;
  }





use vars qw(%patterns %anti_patterns);

# settings
plan tests => 134;

# initialize SpamAssassin
my $sa = create_saobj({'dont_copy_prefs' => 1});

.www.kuiH5sai.com	www.kuiH5sai.com

a=www.zaiNgoo7.com	www.zaiNgoo7.com
b@www.vohWais0.com	mailto:b@www.vohWais0.com	!http://www.vohWais0.com
c.www.moSaoga8.com	www.moSaoga8.com

foo @ cae8kaip.com	mailto:foo@cae8kaip.com
xyz..geifoza0.com	!geifoza0
xyz.geifoza1.com/..xyz	xyz.geifoza1.com	!xyz.geifoza1.com/..xyz
xyz.geifoza2.CoM	xyz.geifoza2.CoM
http://xyz..geifoza3.com	!geifoza3
http://xyz.geifoza4.com/..xyz	xyz.geifoza4.com/..xyz
http://xyz.geifoza5.CoM	xyz.geifoza5.CoM

joe@koja3fui.koja3fui	!koja3fui


http://www.luzoop5k.com		http://www.luzoop5k.com
https://www.luzoop5k.com	https://www.luzoop5k.com
ftp://www.luzoop5k.com		ftp://www.luzoop5k.com
mailto:www.luzoop5k.com		!mailto:www.luzoop5k.com
# no longer accept file: scheme
file://www.luzoop5k.com		!file://www.luzoop5k.com

# //<user>:<password>@<host>:<port>/<url-path>
http://user:pass@jiefeet4.com:80/x/y	http://user:pass@jiefeet4.com:80/x/y

www.liy8quei:80				!liy8quei
www.veibi6cu:443			!veibi6cu
puahi8si.com:80				!puahi8si.com:80
chop8tan.com:443			!chop8tan.com:443
www.puahi9si.com:80		puahi9si.com:80
www.chop9tan.com:443	chop9tan.com:443

ftp://name@su5queib.ca//etc/motd	ftp://name@su5queib.ca//etc/motd
ftp://name@faikaj4t.dom/%2Fetc/motd	!ftp://name@faikaj4t.dom//etc/motd
ftp://name@faikaj4t.com/%2Fetc/motd	ftp://name@faikaj4t.com//etc/motd

keyword:sportscar		!sportscar

# questionable tests
mailto://cah3neun@thaihe4d.com		mailto://cah3neun@thaihe4d.com

mailto://jicu8vah@another@jicu8vah	!jicu8vah@another@jicu8vah
mailto://jicu8vah@another@jicu8vah	jicu8vah@another@jicu8vah
baeb1fai@@example.com			!baeb1fai@@example.com
mailto://yie6xuna		!yie6xuna
mailto://yie6xuna@nottld		!yie6xuna@nottld

#mailto://yie6xuna		!yie6xuna

#http://425EE622		http://66.94.230.34
#gopher://www.luzoop5k.com	gopher://www.luzoop5k.com
#nntp://www.luzoop5k.com	nntp://www.luzoop5k.com
#telnet://www.luzoop5k.com	telnet://www.luzoop5k.com
#wais://www.luzoop5k.com	wais://www.luzoop5k.com
#prospero://www.luzoop5k.com	prospero://www.luzoop5k.com
#nfs://www.luzoop5k.com		nfs://www.luzoop5k.com
#pop://www.luzoop5k.com		pop://www.luzoop5k.com
#tel://www.luzoop5k.com		tel://www.luzoop5k.com
#fax://www.luzoop5k.com		fax://www.luzoop5k.com
#modem://www.luzoop5k.com	modem://www.luzoop5k.com
#ldap://www.luzoop5k.com	ldap://www.luzoop5k.com
#im://www.luzoop5k.com		im://www.luzoop5k.com
#snmp://www.luzoop5k.com	snmp://www.luzoop5k.com

<sentto-4934-foo=addr.com@verper.com>	!^http://.*addr.com@verper.com
<sentto-4934-foo=addr.com@verper.com>	mailto:sentto-4934-foo=addr.com@verper.com


{ESC}(Bhttp://foo23499.com/	^http://foo23499.com/$
http://foo23500.com{ESC}(B/	^http://foo23500.com(?:/?)$

M0"-AE/9Y.KN:_0D2F:95^H*:I,8	!9Y\.KN
>delimtest1.com	http://delimtest1.com
<delimtest2.com	http://delimtest2.com
"delimtest3.com	http://delimtest3.com
\delimtest4.com	http://delimtest4.com
'delimtest5.com	http://delimtest5.com
`delimtest6.com	http://delimtest6.com
,delimtest7.com	http://delimtest7.com
{delimtest8.com	http://delimtest8.com
[delimtest9.com	http://delimtest9.com
(delimtest10.com	http://delimtest10.com
|delimtest11.com	http://delimtest11.com
 delimtest12.com	http://delimtest12.com
ignorethishttp://delimtest13.org	http://delimtest13.org
donotignorethiswww.delimtest14.com	donotignorethiswww.delimtest14.com
<www.delimtest15.com/foo-~!@#^&*()_+=:;'?,.xyz-~!@#^&*()_+=:;'?,.>	^http://www.delimtest15.com/foo-~!@#\^&\*\(\)_\+=:;'\?,\.xyz$
.....www.delimtest16.com..........	^http://www.delimtest16.com$
-----www.delimtest17.com----------	^http://www.delimtest17.com$
.....http://www.delimtest18.com..........	^http://www.delimtest18.com$
-----http://www.delimtest19.com----------	^http://www.delimtest19.com$

Return to bug 5780

Lines 1580-1586 Link Here

(-)lib/Mail/SpamAssassin/PerMsgStatus.pm (-91 / +89 lines)
1580	# strip out the (comments)	1580	# strip out the (comments)
1581	$result =~ s/\s\(.?\)//g;	1581	$result =~ s/\s\(.?\)//g;
1582	# strip out the "quoted text"	1582	# strip out the "quoted text"
1583	$result =~ s/(?<!<)"[^"]*"(?!@)//g;	1583	$result =~ s/(?<!<)"[^"]*"(?!@)//g; #" emacs
1584	# Foo Blah <jm@xxx> or <jm@xxx>	1584	# Foo Blah <jm@xxx> or <jm@xxx>
1585	$result =~ s/^[^<]?<(.?)>.*$/$1/;	1585	$result =~ s/^[^<]?<(.?)>.*$/$1/;
1586	# multiple addresses on one line? remove all but first	1586	# multiple addresses on one line? remove all but first
Lines 1625-1640 Link Here
1625		1625
1626	###########################################################################	1626	###########################################################################
1627		1627
1628	# Taken from URI and URI::Find	1628	# uri parsing from plain text:
1629	my $reserved = q(;/?:@&=+$,[]\#\|);	1629	# The goals are to find URIs in plain text spam that are intended to be clicked on or copy/pasted, but
1630	my $mark = q(-_.!~*'()); #'; emacs	1630	# ignore random strings that might look like URIs, for example in uuencoded files, and to ignore
1631	my $unreserved = "A-Za-z0-9\Q$mark\E\x00-\x08\x0b\x0c\x0e-\x1a\x1c-\x1f";	1631	# URIs that spammers might seed in spam in ways not visible or clickable to add work to spam filters.
1632	my $uricSet = quotemeta($reserved) . $unreserved . "%";	1632	# When we extract a domain and look it up in an RBL, an FP on decding that the text is a URI is not much
		1633	# of a problem, as the only cost is an extra RBL lookup. The same FP is worse if the URI is used in matching rule
		1634	# because it could lead to a rule FP, as in bug 5780 with WIERD_PORT matching random uuencoded strings.
		1635	# The principles of the following code are 1) if ThunderBird or Outlook Express would linkify a string,
		1636	# then we should attempt to parse it as a URI; 2) Where TBird and OE parse differently, choose to do what is most
		1637	# likely to find a domain for the RBL tests; 3) If it begins with a scheme or www\d*\. or ftp\. assume that
		1638	# it is a URI; 4) If it does not then require that the start of the string looks like a FQDN with a valid TLD;
		1639	# 5) Reject strings that after parsing, URLDecoding, and redirection processing don't have a valid TLD
		1640	#
		1641	# We get the entire URI that would be linkified before dealing with it, in order to do the right thing
		1642	# with URI-encodings and redirecting URIs.
		1643	#
		1644	# The delimiters for start of a URI in TBird are @(`{\|[\"'<>,\s in OE they are ("<\s
		1645	#
		1646	# Tbird allows .,?';-! in a URI but ignores [.,?';-!]* at the end.
		1647	# TBird's end delimiters are )`{}\|[]"<>\s but ) is only an end delmiter if there is no ( in the URI
		1648	# OE only uses space as a delimiter, but ignores [~!@#^&()_+`-={}\|[]:";'<>?,.] at the end.
		1649	#
		1650	# Both TBird and OE decide that a URI is an email address when there is '@' character embedded in it.
		1651	# TBird has some additional restrictions on email URIs: They cannot contain non-ASCII characters and their end
		1652	# delimiters include ( and '
		1653	#
		1654	# bug 4522: ISO2022 format mail, most commonly Japanese SHIFT-JIS, inserts a three character escape sequence ESC ( .
1633		1655
1634	my $schemeRE = qr/(?:https?\|ftp\|mailto\|javascript\|file)/i;	1656	# a hybrid of tbird and oe's version of uri parsing
		1657	my $tbirdstartdelim = '><"\'`,{[(\|\s' . "\x1b"; # The \x1b as per bug 4522
		1658	my $iso2022shift = "\x1b" . '\(.'; # bug 4522
		1659	my $tbirdenddelim = '><"`}\]{[\|\s' . "\x1b"; # The \x1b as per bug 4522
		1660	my $oeignoreatend = '-~!@#^&*()_+=:;\'?,.';
		1661	my $nonASCII = '\x80-\xff';
		1662	my $tbirdenddelimemail = $tbirdenddelim . '(\'' . $nonASCII; # tbird ignores non-ASCII mail addresses for now, until RFC changes
		1663	my $tbirdenddelimplusat = $tbirdenddelimemail . '@';
1635		1664
1636	my $uricCheat = $uricSet;	1665	# regexps for finding plain text non-scheme hostnames with valid TLDs.
1637	$uricCheat =~ tr/://d;
1638		1666
1639	# the list from %VALID_TLDS in Util/RegistrarBoundaries.pm, as a	1667	# the list from %VALID_TLDS in Util/RegistrarBoundaries.pm, as a
1640	# Regexp::Optimize optimized regexp ;) accurate as of 20050318	1668	# Regexp::Optimize optimized regexp ;) accurate as of 20050318
Lines 1649-1706 Link Here
1649	\|t[cdfghjklmnoprtvwz]\|u[agkmsyz]\|v[aceginu]\|w[fs]\|xxx\|y[etu]\|z[amw]\|ed?u\|qa	1677	\|t[cdfghjklmnoprtvwz]\|u[agkmsyz]\|v[aceginu]\|w[fs]\|xxx\|y[etu]\|z[amw]\|ed?u\|qa
1650	)/ix;	1678	)/ix;
1651		1679
1652	# from RFC 1035, but allowing domains starting with numbers:	1680	# knownscheme regexp looks for either a https?: or ftp: scheme, or www\d*\. or ftp\. prefix, i.e., likely to start a URL
1653	# $label = q/[A-Za-z\d](?:[A-Za-z\d-]{0,61}[A-Za-z\d])?/;	1681	# schemeless regexp looks for a valid TLD at the end of what may be a FQDN, followed by optional ., optional :portnum, optional /rest_of_uri
1654	# $domain = qq<$label(?:\.$label)*>;	1682	my $urischemeless = qr/[a-z\d][a-z\d._-]{0,251}\.${tldsRE}\.?(?::\d{1,5})?(?:\/[^$tbirdenddelim]{1,251})?/io;
1655	# length($host) <= 255 && $host =~ /^($domain)$/	1683	my $uriknownscheme = qr/(?:(?:(?:(?:https?)\|(?:ftp)):(?:\/\/)?)\|(?:(?:www\d{0,2}\|ftp)\.))[^$tbirdenddelim]{1,251}/io;
1656	# changes:	1684	my $urimailscheme = qr/(?:mailto:)?[^$tbirdenddelimplusat]{1,251}@[^$tbirdenddelimemail]{1,251}/io;
1657	# massively simplified from grammar, only matches known TLDs, a single	1685	my $tbirdurire = qr/(?:\b\|(?<=$iso2022shift)\|(?<=[$tbirdstartdelim]))
1658	# dot at end of TLD works	1686	(?:(?:($uriknownscheme)(?=[$tbirdenddelim])) \|
1659	# negative look-behinds:	1687	(?:($urimailscheme)(?=[$tbirdenddelimemail])) \|
1660	# (?<![a-z\d][.-]) = don't let there be more hostname behind, but	1688	(?:\b($urischemeless)(?=[$tbirdenddelim])))/xo;
1661	# don't miss ".....www.bar.com" or "-----www.foo.com"
1662	# (?<!.\@) = this will be caught by the email address regular expression
1663	my $schemelessRE = qr/(?<![a-z\d][._-])(?<!.\@)\b[a-z\d]
1664	[a-z\d._-]{0,251}
1665	\.${tldsRE}\.?\b
1666	(?![a-z\d._-])
1667	/ix;
1668		1689
1669	my $uriRe = qr/\b(?:$schemeRE:[$uricCheat]\|$schemelessRE)[$uricSet#]*/o;
1670
1671	# Taken from Email::Find (thanks Tatso!)
1672	# This is the BNF from RFC 822
1673	my $esc = '\\\\';
1674	my $period = '\.';
1675	my $space = '\040';
1676	my $open_br = '\[';
1677	my $close_br = '\]';
1678	my $nonASCII = '\x80-\xff';
1679	my $ctrl = '\000-\037';
1680	my $cr_list = '\n\015';
1681	my $qtext = qq/[^$esc$nonASCII$cr_list\"]/; #"
1682	my $dtext = qq/[^$esc$nonASCII$cr_list$open_br$close_br]/;
1683	my $quoted_pair = qq<$esc>.qq<[^$nonASCII]>;
1684	my $atom_char = qq/[^($space)<>\@,;:\".$esc$open_br$close_br$ctrl$nonASCII]/;
1685	#"
1686	my $atom = qq{(?>$atom_char+)};
1687	my $quoted_str = qq<\"$qtext(?:$quoted_pair$qtext)*\">; #"
1688	my $word = qq<(?:$atom\|$quoted_str)>;
1689	my $local_part = qq<$word(?:$period$word)*>;
1690
1691	# This is a combination of the domain name BNF from RFC 1035 plus the
1692	# domain literal definition from RFC 822, but allowing domains starting
1693	# with numbers.
1694	my $label = q/[A-Za-z\d](?:[A-Za-z\d-]*[A-Za-z\d])?/;
1695	my $domain_ref = qq<$label(?:$period$label)*>;
1696	my $domain_lit = qq<$open_br(?:$dtext\|$quoted_pair)*$close_br>;
1697	my $domain = qq<(?:$domain_ref\|$domain_lit)>;
1698
1699	# Finally, the address-spec regex (more or less)
1700	my $Addr_spec_re = qr<$local_part\s\@\s$domain>o;
1701
1702	# TVD: This really belongs in metadata
1703
1704	=item $status->get_uri_list ()	1690	=item $status->get_uri_list ()
1705		1691
1706	Returns an array of all unique URIs found in the message. It takes	1692	Returns an array of all unique URIs found in the message. It takes
Lines 1895-1900 Link Here
1895	# also, if we allow $textary to be passed in, we need to invalidate	1881	# also, if we allow $textary to be passed in, we need to invalidate
1896	# the cache first. fyi.	1882	# the cache first. fyi.
1897	my $textary = $self->get_decoded_stripped_body_text_array();	1883	my $textary = $self->get_decoded_stripped_body_text_array();
		1884	my $redirector_patterns = $self->{conf}->{redirector_patterns};
1898		1885
1899	my ($rulename, $pat, @uris);	1886	my ($rulename, $pat, @uris);
1900	local ($_);	1887	local ($_);
Lines 1903-1952 Link Here
1903		1890
1904	for (@$textary) {	1891	for (@$textary) {
1905	# NOTE: do not modify $_ in this loop	1892	# NOTE: do not modify $_ in this loop
1906	while (/($uriRe)/igo) {	1893	while (/$tbirdurire/igo) {
1907	my $uri = $1;	1894	my $rawuri = $1\|\|$2\|\|$3;
		1895	$rawuri =~ s/(^[^(])\).$/$1/; # as per ThunderBird, ) is an end delimiter if there is no ( preceeding it
		1896	$rawuri =~ s/[$oeignoreatend]*$//; # remove trailing string of punctuations that TBird ignores
		1897	# skip if there is '..' in the hostname portion of the URI, something we can't catch in the general URI regexp
		1898	next if $rawuri =~ /^(?:(?:https?\|ftp\|mailto):(?:\/\/)?)?[a-z\d.-]*\.\./i;
1908		1899
1909	# skip mismatches from URI regular expression	1900	# If it's a hostname that was just sitting out in the
1910	next if $uri =~ /^[a-z\d.-]*\.\./i; # skip ".."	1901	# open, without a protocol, and not inside of an HTML tag,
1911		1902	# the we should add the proper protocol in front, rather
1912	$uri =~ s/^<(.*)>$/$1/;	1903	# than using the base URI.
1913	$uri =~ s/[\]\)>#]$//;	1904	my $uri = $rawuri;
1914		1905	my $rblonly;
1915	if ($uri !~ /^${schemeRE}:/io) {	1906	if ($uri !~ /^(?:https?\|ftp\|mailto\|javascript\|file):/i) {
1916	# If it's a hostname that was just sitting out in the
1917	# open, without a protocol, and not inside of an HTML tag,
1918	# the we should add the proper protocol in front, rather
1919	# than using the base URI.
1920	if ($uri =~ /^ftp\./i) {	1907	if ($uri =~ /^ftp\./i) {
1921	push (@uris, $uri);
1922	$uri = "ftp://$uri";	1908	$uri = "ftp://$uri";
1923	}	1909	}
1924	if ($uri =~ /\@/) {	1910	elsif ($uri =~ /^www\d{0,2}\./i) {
1925	push (@uris, $uri);	1911	$uri = "http://$uri";
		1912	}
		1913	elsif ($uri =~ /\@/) {
1926	$uri = "mailto:$uri";	1914	$uri = "mailto:$uri";
1927	}	1915	}
1928	else # if ($uri =~ /^www\d*\./i)	1916	else {
1929	{
1930	# some spammers are using unschemed URIs to escape filters	1917	# some spammers are using unschemed URIs to escape filters
1931	push (@uris, $uri);	1918	$rblonly = 1; # flag that this is a URI that MUAs don't linkify so only use for RBLs
1932	$uri = "http://$uri";	1919	$uri = "http://$uri";
1933	}	1920	}
1934	}	1921	}
1935		1922
1936	# warn("uri: got URI: $uri\n");	1923	if ($uri =~ /^mailto:/) {
1937	push @uris, $uri;	1924	# skip a mail link that does not have a valid TLD or other than one @ after decoding any URLEncoded characters
1938	}	1925	$uri = Mail::SpamAssassin::Util::url_encode($uri) if ($uri =~ /\%(?:2[1-9a-fA-F]\|[3-6][0-9a-fA-f]\|7[0-9a-eA-E])/);
1939	while (/($Addr_spec_re)/igo) {	1926	next if ($uri !~ /^[^@]+@[^@]+$/);
1940	my $uri = $1;	1927	my $domuri = Mail::SpamAssassin::Util::uri_to_domain($uri);
		1928	next unless $domuri;
		1929	push (@uris, $rawuri);
		1930	push (@uris, $uri) unless ($rawuri eq $uri);
		1931	}
1941		1932
1942	# skip mismatches from email address regular expression	1933	next unless ($uri =~/^(?:https?\|ftp):/); # at this point only valid if one or the other of these
1943	next unless $uri =~ /\.${tldsRE}\W*$/io; # skip non-TLDs
1944		1934
1945	$uri =~ s/\s\@\s/@/; # remove spaces around the '@'	1935	my @tmp = Mail::SpamAssassin::Util::uri_list_canonify($redirector_patterns, $uri);
1946	$uri = "mailto:$uri"; # prepend mailto:	1936	my $goodurifound = 0;
1947		1937	foreach my $cleanuri (@tmp) {
1948	#warn("uri: got URI: $uri\n");	1938	my $domain = Mail::SpamAssassin::Util::uri_to_domain($cleanuri);
1949	push @uris, $uri;	1939	if ($domain) {
		1940	# bug 5780: Stop after domain to avoid FP, but do that after all deobfuscation of urlencoding and redirection
		1941	$cleanuri =~ s/^(https?:\/\/[^:\/]+).*$/$1/ if $rblonly;
		1942	push (@uris, $cleanuri);
		1943	$goodurifound = 1;
		1944	}
		1945	}
		1946	next unless $goodurifound;
		1947	push @uris, $rawuri unless $rblonly;
1950	}	1948	}
1951	}	1949	}
1952		1950

Lines 24-30 Link Here

(-)t/uri_text.t (-29 / +42 lines)
24	use vars qw(%patterns %anti_patterns);	24	use vars qw(%patterns %anti_patterns);
25		25
26	# settings	26	# settings
27	plan tests => 104;	27	plan tests => 134;
28		28
29	# initialize SpamAssassin	29	# initialize SpamAssassin
30	my $sa = create_saobj({'dont_copy_prefs' => 1});	30	my $sa = create_saobj({'dont_copy_prefs' => 1});
Lines 133-143 Link Here
133	.www.kuiH5sai.com www.kuiH5sai.com	133	.www.kuiH5sai.com www.kuiH5sai.com
134		134
135	a=www.zaiNgoo7.com www.zaiNgoo7.com	135	a=www.zaiNgoo7.com www.zaiNgoo7.com
136	b@www.vohWais0.com mailto:b@www.vohWais0.com !http://www.vohWais0.com	136	b@www.vohWais0.com mailto:b@www.vohWais0.com !http://www.vohWais0.com
137	c.www.moSaoga8.com www.moSaoga8.com	137	c.www.moSaoga8.com www.moSaoga8.com
138		138
139	foo @ cae8kaip.com mailto:foo@cae8kaip.com
140	xyz..geifoza0.com !geifoza0	139	xyz..geifoza0.com !geifoza0
		140	xyz.geifoza1.com/..xyz xyz.geifoza1.com !xyz.geifoza1.com/..xyz
		141	xyz.geifoza2.CoM xyz.geifoza2.CoM
		142	http://xyz..geifoza3.com !geifoza3
		143	http://xyz.geifoza4.com/..xyz xyz.geifoza4.com/..xyz
		144	http://xyz.geifoza5.CoM xyz.geifoza5.CoM
141		145
142	joe@koja3fui.koja3fui !koja3fui	146	joe@koja3fui.koja3fui !koja3fui
143		147
Lines 219-263 Link Here
219	http://www.luzoop5k.com http://www.luzoop5k.com	223	http://www.luzoop5k.com http://www.luzoop5k.com
220	https://www.luzoop5k.com https://www.luzoop5k.com	224	https://www.luzoop5k.com https://www.luzoop5k.com
221	ftp://www.luzoop5k.com ftp://www.luzoop5k.com	225	ftp://www.luzoop5k.com ftp://www.luzoop5k.com
222	mailto:www.luzoop5k.com mailto:www.luzoop5k.com	226	mailto:www.luzoop5k.com !mailto:www.luzoop5k.com
223	file://www.luzoop5k.com file://www.luzoop5k.com	227	# no longer accept file: scheme
		228	file://www.luzoop5k.com !file://www.luzoop5k.com
224		229
225	# //<user>:<password>@<host>:<port>/<url-path>	230	# //<user>:<password>@<host>:<port>/<url-path>
226	http://user:pass@jiefeet4.com:80/x/y http://user:pass@jiefeet4.com:80/x/y	231	http://user:pass@jiefeet4.com:80/x/y http://user:pass@jiefeet4.com:80/x/y
227		232
228	liy8quei:80 !liy8quei	233	www.liy8quei:80 !liy8quei
229	veibi6cu:443 !veibi6cu	234	www.veibi6cu:443 !veibi6cu
230	puahi8si.com:80 puahi8si.com:80	235	puahi8si.com:80 !puahi8si.com:80
231	chop8tan.com:443 chop8tan.com:443	236	chop8tan.com:443 !chop8tan.com:443
		237	www.puahi9si.com:80 puahi9si.com:80
		238	www.chop9tan.com:443 chop9tan.com:443
232		239
233	ftp://name@su5queib.ca//etc/motd ftp://name@su5queib.ca//etc/motd	240	ftp://name@su5queib.ca//etc/motd ftp://name@su5queib.ca//etc/motd
234	ftp://name@faikaj4t.dom/%2Fetc/motd ftp://name@faikaj4t.dom//etc/motd	241	ftp://name@faikaj4t.dom/%2Fetc/motd !ftp://name@faikaj4t.dom//etc/motd
		242	ftp://name@faikaj4t.com/%2Fetc/motd ftp://name@faikaj4t.com//etc/motd
235		243
236	keyword:sportscar !sportscar	244	keyword:sportscar !sportscar
237		245
238	# questionable tests	246	# questionable tests
		247	mailto://cah3neun@thaihe4d.com mailto://cah3neun@thaihe4d.com
239		248
240	mailto://cah3neun@thaihe4d.com mailto://cah3neun@thaihe4d.com	249	mailto://jicu8vah@another@jicu8vah !jicu8vah@another@jicu8vah
241	mailto://jicu8vah@another@jicu8vah jicu8vah@another@jicu8vah
242	baeb1fai@@example.com !baeb1fai@@example.com	250	baeb1fai@@example.com !baeb1fai@@example.com
		251	mailto://yie6xuna !yie6xuna
		252	mailto://yie6xuna@nottld !yie6xuna@nottld
243		253
244	#mailto://yie6xuna !yie6xuna
245
246	#http://425EE622 http://66.94.230.34
247	#gopher://www.luzoop5k.com gopher://www.luzoop5k.com
248	#nntp://www.luzoop5k.com nntp://www.luzoop5k.com
249	#telnet://www.luzoop5k.com telnet://www.luzoop5k.com
250	#wais://www.luzoop5k.com wais://www.luzoop5k.com
251	#prospero://www.luzoop5k.com prospero://www.luzoop5k.com
252	#nfs://www.luzoop5k.com nfs://www.luzoop5k.com
253	#pop://www.luzoop5k.com pop://www.luzoop5k.com
254	#tel://www.luzoop5k.com tel://www.luzoop5k.com
255	#fax://www.luzoop5k.com fax://www.luzoop5k.com
256	#modem://www.luzoop5k.com modem://www.luzoop5k.com
257	#ldap://www.luzoop5k.com ldap://www.luzoop5k.com
258	#im://www.luzoop5k.com im://www.luzoop5k.com
259	#snmp://www.luzoop5k.com snmp://www.luzoop5k.com
260
261	<sentto-4934-foo=addr.com@verper.com> !^http://.*addr.com@verper.com	254	<sentto-4934-foo=addr.com@verper.com> !^http://.*addr.com@verper.com
262	<sentto-4934-foo=addr.com@verper.com> mailto:sentto-4934-foo=addr.com@verper.com	255	<sentto-4934-foo=addr.com@verper.com> mailto:sentto-4934-foo=addr.com@verper.com
263		256
Lines 265-267 Link Here
265	{ESC}(Bhttp://foo23499.com/ ^http://foo23499.com/$	258	{ESC}(Bhttp://foo23499.com/ ^http://foo23499.com/$
266	http://foo23500.com{ESC}(B/ ^http://foo23500.com(?:/?)$	259	http://foo23500.com{ESC}(B/ ^http://foo23500.com(?:/?)$
267		260
		261	M0"-AE/9Y.KN:_0D2F:95^H*:I,8 !9Y\.KN
		262	>delimtest1.com http://delimtest1.com
		263	<delimtest2.com http://delimtest2.com
		264	"delimtest3.com http://delimtest3.com
		265	\delimtest4.com http://delimtest4.com
		266	'delimtest5.com http://delimtest5.com
		267	`delimtest6.com http://delimtest6.com
		268	,delimtest7.com http://delimtest7.com
		269	{delimtest8.com http://delimtest8.com
		270	[delimtest9.com http://delimtest9.com
		271	(delimtest10.com http://delimtest10.com
		272	\|delimtest11.com http://delimtest11.com
		273	delimtest12.com http://delimtest12.com
		274	ignorethishttp://delimtest13.org http://delimtest13.org
		275	donotignorethiswww.delimtest14.com donotignorethiswww.delimtest14.com
		276	<www.delimtest15.com/foo-~!@#^&()_+=:;'?,.xyz-~!@#^&()_+=:;'?,.> ^http://www.delimtest15.com/foo-~!@#\^&\*\(\)_\+=:;'\?,\.xyz$
		277	.....www.delimtest16.com.......... ^http://www.delimtest16.com$
		278	-----www.delimtest17.com---------- ^http://www.delimtest17.com$
		279	.....http://www.delimtest18.com.......... ^http://www.delimtest18.com$
		280	-----http://www.delimtest19.com---------- ^http://www.delimtest19.com$

Lines 46-52 Link Here

(-)t/uri.t (-1 / +3 lines)
46	ok ($urimap{'http://66.92.69.222/'});	46	ok ($urimap{'http://66.92.69.222/'});
47	ok ($urimap{'http://66.92.69.223/'});	47	ok ($urimap{'http://66.92.69.223/'});
48	ok ($urimap{'http://66.92.69.224/'});	48	ok ($urimap{'http://66.92.69.224/'});
49	ok ($urimap{'spamassassin.org'});	49	ok ($urimap{'http://spamassassin.org'});
50	ok (!$urimap{'CUMSLUTS.'});	50	ok (!$urimap{'CUMSLUTS.'});
51	ok (!$urimap{'CUMSLUTS..VIRGIN'});	51	ok (!$urimap{'CUMSLUTS..VIRGIN'});
52		52
Lines 58-66 Link Here
58		58
59	# undef is valid in some situations, so deal with it...	59	# undef is valid in some situations, so deal with it...
60	if (!defined $expect) {	60	if (!defined $expect) {
		61	warn("try_domains: failed! expect: undefined got: '$result'\n") if (defined $result);
61	return !defined $result;	62	return !defined $result;
62	}	63	}
63	elsif (!defined $result) {	64	elsif (!defined $result) {
		65	warn "try_domains: failed! expect: '$expect' got: undefined\n";
64	return 0;	66	return 0;
65	}	67	}
66		68