Index: lib/Mail/SpamAssassin/Constants.pm =================================================================== --- lib/Mail/SpamAssassin/Constants.pm (revision 1686884) +++ lib/Mail/SpamAssassin/Constants.pm (working copy) @@ -46,6 +46,7 @@ MAX_BODY_LINE_LENGTH MAX_HEADER_KEY_LENGTH MAX_HEADER_VALUE_LENGTH MAX_HEADER_LENGTH ARITH_EXPRESSION_LEXER AI_TIME_UNKNOWN CHARSETS_LIKELY_TO_FP_AS_CAPS MAX_URI_LENGTH + $WHITESPACE_UTF8_RE $ALT_FULLSTOP_UTF8_RE ); %EXPORT_TAGS = ( @@ -391,6 +392,94 @@ [\?:] # ? : Operator )/ox; + +our($WHITESPACE_UTF8_RE, $ALT_FULLSTOP_UTF8_RE); +BEGIN { + # http://en.wikipedia.org/wiki/Whitespace_character + # Unicode property \p{Space} plus a 'ZERO WIDTH SPACE' U+200B + my $wsp_chars = "\x{0009}\x{000A}\x{000B}\x{000C}\x{000D}\x{0020}\x{0085}". + "\x{00A0}\x{1680}\x{2000}\x{2001}\x{2002}\x{2003}\x{2004}". + "\x{2005}\x{2006}\x{2007}\x{2008}\x{2009}\x{200A}\x{200B}". + "\x{2028}\x{2029}\x{202F}\x{205F}\x{3000}"; + my $wsp_bytes = join('|', split(//,$wsp_chars)); utf8::encode($wsp_bytes); + $WHITESPACE_UTF8_RE = qr/$wsp_bytes/so; + + # Bug 6751: + # RFC 3490 (IDNA): Whenever dots are used as label separators, the + # following characters MUST be recognized as dots: U+002E (full stop), + # U+3002 (ideographic full stop), U+FF0E (fullwidth full stop), + # U+FF61 (halfwidth ideographic full stop). + # RFC 5895: [...] the IDEOGRAPHIC FULL STOP character (U+3002) + # can be mapped to the FULL STOP before label separation occurs. + # [...] Only the IDEOGRAPHIC FULL STOP character (U+3002) is added in + # this mapping because the authors have not fully investigated [...] + # Adding also 'SMALL FULL STOP' (U+FE52) as seen in the wild, + # and a 'ONE DOT LEADER' (U+2024). + # + my $dot_chars = "\x{2024}\x{3002}\x{FF0E}\x{FF61}\x{FE52}"; # \x{002E} + my $dot_bytes = join('|', split(//,$dot_chars)); utf8::encode($dot_bytes); + $ALT_FULLSTOP_UTF8_RE = qr/$dot_bytes/so; +} + +# http://en.wikipedia.org/wiki/Whitespace_character +# Unicode property \p{Space} plus a 'ZERO WIDTH SPACE' U+200B +sub InIDNAWhitespace { + return <<'END'; ++\p{Space} ++200B +END +} + +sub InIDNAFullStop { + return <<'END'; +002E +2024 +3002 +FE52 +FF0E +FF61 +END +} + +# http://unicode.org/faq/idn.html IDNA2008, perlunicode(1) man page +sub InIDNA2008 { + return <<'END'; +!utf8::Changes_When_NFKC_Casefolded +-utf8::c +-utf8::z +-utf8::s +-utf8::p +-utf8::nl +-utf8::no +-utf8::me +-utf8::HST=L +-utf8::HST=V +-utf8::HST=V +-utf8::block=Combining_Diacritical_Marks_For_Symbols +-utf8::block=Ancient_Greek_Musical_Notation +-utf8::block=Musical_Symbols +-0640 +-07FA +-302E +-302F +-3031 3035 +-303B ++00B7 ++0375 ++05F3 ++05F4 ++30FB ++002D ++06FD ++06FE ++0F0B ++3007 ++00DF ++03C2 ++utf8::JoinControl +END +} + # ArchiveIterator # if AI doesn't read in the message in the first pass to see if the received Index: lib/Mail/SpamAssassin/PerMsgStatus.pm =================================================================== --- lib/Mail/SpamAssassin/PerMsgStatus.pm (revision 1686884) +++ lib/Mail/SpamAssassin/PerMsgStatus.pm (working copy) @@ -2109,14 +2112,14 @@ # knownscheme regexp looks for either a https?: or ftp: scheme, or www\d*\. or ftp\. prefix, i.e., likely to start a URL # schemeless regexp looks for a valid TLD at the end of what may be a FQDN, followed by optional ., optional :portnum, optional /rest_of_uri - my $urischemeless = qr/[a-z\d][a-z\d._-]{0,251}\.${tldsRE}\.?(?::\d{1,5})?(?:\/[^$tbirdenddelim]{1,251})?/io; - my $uriknownscheme = qr/(?:(?:(?:(?:https?)|(?:ftp)):(?:\/\/)?)|(?:(?:www\d{0,2}|ftp)\.))[^$tbirdenddelim]{1,251}/io; - my $urimailscheme = qr/(?:mailto:)?[^$tbirdenddelimplusat]{1,251}@[^$tbirdenddelimemail]{1,251}/io; + my $urischemeless = qr/[a-z\d][a-z\d._-]{0,251}\.${tldsRE}\.?(?::\d{1,5})?(?:\/[^$tbirdenddelim]{1,251}?)?/io; + my $uriknownscheme = qr/(?:(?:(?:(?:https?)|(?:ftp)):(?:\/\/)?)|(?:(?:www\d{0,2}|ftp)\.))[^$tbirdenddelim]{1,251}?/io; + my $urimailscheme = qr/(?:mailto:)?[^$tbirdenddelimplusat]{1,251}@[^$tbirdenddelimemail]{1,251}?/io; $self->{tbirdurire} = qr/(?:\b|(?<=$iso2022shift)|(?<=[$tbirdstartdelim])) - (?:(?:($uriknownscheme)(?=(?:[$tbirdenddelim]|\z))) | - (?:($urimailscheme)(?=(?:[$tbirdenddelimemail]|\z))) | - (?:\b($urischemeless)(?=(?:[$tbirdenddelim]|\z))))/xo; + (?:(?:($uriknownscheme)(?=(?:[$tbirdenddelim]|$WHITESPACE_UTF8_RE|\z))) | + (?:($urimailscheme)(?=(?:[$tbirdenddelimemail]|$WHITESPACE_UTF8_RE|\z))) | + (?:\b($urischemeless)(?=(?:[$tbirdenddelim]|$WHITESPACE_UTF8_RE|\z))))/xo; return $self->{tbirdurire}; } Index: lib/Mail/SpamAssassin/Plugin/HeaderEval.pm =================================================================== --- lib/Mail/SpamAssassin/Plugin/HeaderEval.pm (revision 1686884) +++ lib/Mail/SpamAssassin/Plugin/HeaderEval.pm (working copy) @@ -25,6 +25,7 @@ use Mail::SpamAssassin::Plugin; use Mail::SpamAssassin::Locales; +use Mail::SpamAssassin::Util qw(get_my_locales parse_rfc822_date idn_to_ascii); use Mail::SpamAssassin::Logger; use Mail::SpamAssassin::Constants qw(:sa :ip); @@ -1037,6 +1038,7 @@ if ($to =~ /^([^@]+)@(.+)$/) { my($user,$dom) = ($1,$2); + $dom = idn_to_ascii($dom); $dom = $self->{main}->{registryboundaries}->trim_domain($dom); return unless ($self->{main}->{registryboundaries}->is_domain_valid($dom)); Index: lib/Mail/SpamAssassin/Util.pm =================================================================== --- lib/Mail/SpamAssassin/Util.pm (revision 1686884) +++ lib/Mail/SpamAssassin/Util.pm (working copy) @@ -62,10 +63,12 @@ @EXPORT_OK = qw(&local_tz &base64_decode &untaint_var &untaint_file_path &exit_status_str &proc_status_ok &am_running_on_windows &reverse_ip_address &decode_dns_question_entry - &secure_tmpfile &secure_tmpdir &uri_list_canonicalize); + &secure_tmpfile &secure_tmpdir &uri_list_canonicalize + &get_my_locales &parse_rfc822_date &idn_to_ascii); } use Mail::SpamAssassin; +use Mail::SpamAssassin::Constants qw(:sa); use Config; use IO::Handle; @@ -74,6 +77,7 @@ use Time::Local; use Sys::Hostname (); # don't import hostname() into this namespace! use NetAddr::IP 4.000; +use Scalar::Util qw(tainted); use Fcntl; use Errno qw(ENOENT EACCES EEXIST); use POSIX qw(:sys_wait_h WIFEXITED WIFSIGNALED WIFSTOPPED WEXITSTATUS @@ -96,6 +100,22 @@ ########################################################################### +our $enc_utf8; +BEGIN { + eval { require Encode } + and do { $enc_utf8 = Encode::find_encoding('UTF-8') } +}; + +our $have_libidn; +BEGIN { + eval { require Net::LibIDN } and do { $have_libidn = 1 }; +} + +$have_libidn or warn "INFO: module Net::LibIDN not available,\n". + " internationalized domain names with U-labels will not be recognized!\n"; + +########################################################################### + # find an executable in the current $PATH (or whatever for that platform) { # Show the PATH we're going to explore only once. @@ -338,6 +358,109 @@ ########################################################################### +# returns true if the provided string of octets represents a syntactically +# valid UTF-8 string, otherwise a false is returned +# +sub is_valid_utf_8($) { +# my $octets = $_[0]; + return undef if !defined $_[0]; + # + # RFC 6532: UTF8-non-ascii = UTF8-2 / UTF8-3 / UTF8-4 + # RFC 3629 section 4: Syntax of UTF-8 Byte Sequences + # UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4 + # UTF8-1 = %x00-7F + # UTF8-2 = %xC2-DF UTF8-tail + # UTF8-3 = %xE0 %xA0-BF UTF8-tail / + # %xE1-EC 2( UTF8-tail ) / + # %xED %x80-9F UTF8-tail / + # # U+D800..U+DFFF are utf16 surrogates, not legal utf8 + # %xEE-EF 2( UTF8-tail ) + # UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / + # %xF1-F3 3( UTF8-tail ) / + # %xF4 %x80-8F 2( UTF8-tail ) + # UTF8-tail = %x80-BF + # + # loose variant: + # [\x00-\x7F] | [\xC0-\xDF][\x80-\xBF] | + # [\xE0-\xEF][\x80-\xBF]{2} | [\xF0-\xF4][\x80-\xBF]{3} + # + $_[0] =~ /^ (?: [\x00-\x7F] | + [\xC2-\xDF] [\x80-\xBF] | + \xE0 [\xA0-\xBF] [\x80-\xBF] | + [\xE1-\xEC] [\x80-\xBF]{2} | + \xED [\x80-\x9F] [\x80-\xBF] | + [\xEE-\xEF] [\x80-\xBF]{2} | + \xF0 [\x90-\xBF] [\x80-\xBF]{2} | + [\xF1-\xF3] [\x80-\xBF]{3} | + \xF4 [\x80-\x8F] [\x80-\xBF]{2} )* \z/xs ? 1 : 0; +} + +# Given an international domain name with U-labels (UTF-8 or Unicode chars) +# converts it to ASCII-compatible encoding (ACE). If the argument is in +# ASCII (or is an invalid IDN), returns it lowercased but otherwise unchanged. +# The result is always in octets (utf8 flag off) even if the argument was in +# Unicode characters. +# +sub idn_to_ascii($) { + no bytes; + my $s = $_[0]; + return undef if !defined $s; + # propagate taintedness of the argument, but not its utf8 flag + my $t = tainted($s); # taintedness of the argument + $t = untaint_var($t) if $t; + # leave octets unchanged (not necessarily valid UTF-8), encode chars to UTF-8 + utf8::encode($s) if utf8::is_utf8($s); + if ($s !~ tr/\x00-\x7F//c) { # is all-ASCII (including IP address literal) + $s = lc $s; + } elsif (!is_valid_utf_8($s)) { + info("util: idn_to_ascii: not valid UTF-8 (%d): /%s/", $t, $s); + $s = lc $s; # garbage-in / garbage-out + } else { + my $chars; + if (eval { $chars = $enc_utf8->decode($s,1|8); 1 }) { + $chars =~ s/\p{Mail::SpamAssassin::Constants::InIDNAFullStop}/./gso; + local $1; + if (lc($chars) =~ /([.\p{Mail::SpamAssassin::Constants::InIDNA2008}]+)/) { + $chars = $1; + utf8::encode($chars); + if ($chars ne $s) { + info("util: idn_to_ascii: extracted: /%s/ -> /%s/", $s, $chars); + $s = $chars; + } + } + } else { + # RFC 3490 (IDNA): Whenever dots are used as label separators, the + # following characters MUST be recognized as dots: U+002E (full stop), + # U+3002 (ideographic full stop), U+FF0E (fullwidth full stop), + # U+FF61 (halfwidth ideographic full stop). + $s =~ s/$ALT_FULLSTOP_UTF8_RE/./gso; + # trim whitespace +my $ssv = $s; + $s =~ s/^$WHITESPACE_UTF8_RE+//so; + $s =~ s/$WHITESPACE_UTF8_RE+\z//so; +if ($ssv ne $s) { + info("util: idn_to_ascii: trimmed (%d): /%s/ -> /%s/", $t, $ssv, $s); +} + } + if ($have_libidn) { + # to ASCII-compatible encoding (ACE), lowercased + my $sa = Net::LibIDN::idn_to_ascii($s, 'UTF-8'); + +if (!defined $sa) { + info("util: idn_to_ascii: conversion to ACE failed (%d): /%s/", $t, $s); +} elsif ($sa ne lc $s) { + info("util: idn_to_ascii: converted to ACE (%d): /%s/ -> /%s/", $t, $s, $sa); +} else { + info("util: idn_to_ascii: unchanged (%d): /%s/", $t, $s); +} + $s = $sa if defined $sa; + } + } + $t ? taint_var($s) : $s; # propagate taintedness of the argument +} + +########################################################################### + # map process termination status number to an informative string, and # append optional mesage (dual-valued errno or a string or a number), # returning the resulting string @@ -1314,20 +1439,10 @@ # not required $rest ||= ''; - # Bug 6751: - # RFC 3490 (IDNA): Whenever dots are used as label separators, the - # following characters MUST be recognized as dots: U+002E (full stop), - # U+3002 (ideographic full stop), U+FF0E (fullwidth full stop), - # U+FF61 (halfwidth ideographic full stop). - # RFC 5895: [...] the IDEOGRAPHIC FULL STOP character (U+3002) - # can be mapped to the FULL STOP before label separation occurs. - # [...] Only the IDEOGRAPHIC FULL STOP character (U+3002) is added in - # this mapping because the authors have not fully investigated [...] - # Adding also 'SMALL FULL STOP' (U+FE52) as seen in the wild. - # Parhaps also the 'ONE DOT LEADER' (U+2024). - if ($host =~ s{(?: \xE3\x80\x82 | \xEF\xBC\x8E | \xEF\xBD\xA1 | - \xEF\xB9\x92 | \xE2\x80\xA4 )}{.}xgs) { - push(@nuris, join ('', $proto, $host, $rest)); + my $nhost = idn_to_ascii($host); + if (defined $nhost && $nhost ne lc $host) { + push(@nuris, join('', $proto, $nhost, $rest)); + $host = $nhost; } # bug 4146: deal with non-US ASCII 7-bit chars in the host portion @@ -1334,7 +1449,8 @@ # of the URI according to RFC 1738 that's invalid, and the tested # browsers (Firefox, IE) remove them before usage... if ($host =~ tr/\000-\040\200-\377//d) { - push(@nuris, join ('', $proto, $host, $rest)); + push(@nuris, join ('', $proto, $host, $rest)) + if $host =~ /[^\x00-\x2F\x3A-\x40\x5B-\x60\x7B-\x7F\x80]/; } # deal with http redirectors. strip off one level of redirector @@ -1375,7 +1491,8 @@ # the host portion should end in some form of alpha-numeric, strip off # the rest. if ($host =~ s/[^0-9A-Za-z]+$//) { - push(@nuris, join ('', $proto, $host, $rest)); + push(@nuris, join ('', $proto, $host, $rest)) + if $host =~ /[^\x00-\x2F\x3A-\x40\x5B-\x60\x7B-\x7F\x80]/; } ########################