Index: lib/Mail/SpamAssassin/Constants.pm
===================================================================
--- lib/Mail/SpamAssassin/Constants.pm	(revision 1686884)
+++ lib/Mail/SpamAssassin/Constants.pm	(working copy)
@@ -46,6 +46,7 @@
 	MAX_BODY_LINE_LENGTH MAX_HEADER_KEY_LENGTH MAX_HEADER_VALUE_LENGTH
 	MAX_HEADER_LENGTH ARITH_EXPRESSION_LEXER AI_TIME_UNKNOWN
 	CHARSETS_LIKELY_TO_FP_AS_CAPS MAX_URI_LENGTH
+        $WHITESPACE_UTF8_RE $ALT_FULLSTOP_UTF8_RE
   );
 
   %EXPORT_TAGS = (
@@ -391,6 +392,94 @@
         [\?:]                                   # ? : Operator
       )/ox;
 
+
+our($WHITESPACE_UTF8_RE, $ALT_FULLSTOP_UTF8_RE);
+BEGIN {
+  # http://en.wikipedia.org/wiki/Whitespace_character
+  # Unicode property \p{Space} plus a 'ZERO WIDTH SPACE' U+200B
+  my $wsp_chars = "\x{0009}\x{000A}\x{000B}\x{000C}\x{000D}\x{0020}\x{0085}".
+                  "\x{00A0}\x{1680}\x{2000}\x{2001}\x{2002}\x{2003}\x{2004}".
+                  "\x{2005}\x{2006}\x{2007}\x{2008}\x{2009}\x{200A}\x{200B}".
+                  "\x{2028}\x{2029}\x{202F}\x{205F}\x{3000}";
+  my $wsp_bytes = join('|', split(//,$wsp_chars));  utf8::encode($wsp_bytes);
+  $WHITESPACE_UTF8_RE = qr/$wsp_bytes/so;
+
+  # Bug 6751:
+  # RFC 3490 (IDNA): Whenever dots are used as label separators, the
+  #   following characters MUST be recognized as dots: U+002E (full stop),
+  #   U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
+  #   U+FF61 (halfwidth ideographic full stop).
+  # RFC 5895: [...] the IDEOGRAPHIC FULL STOP character (U+3002)
+  #   can be mapped to the FULL STOP before label separation occurs.
+  #   [...] Only the IDEOGRAPHIC FULL STOP character (U+3002) is added in
+  #   this mapping because the authors have not fully investigated [...]
+  # Adding also 'SMALL FULL STOP' (U+FE52) as seen in the wild,
+  # and a 'ONE DOT LEADER' (U+2024).
+  #
+  my $dot_chars = "\x{2024}\x{3002}\x{FF0E}\x{FF61}\x{FE52}";  # \x{002E}
+  my $dot_bytes = join('|', split(//,$dot_chars));  utf8::encode($dot_bytes);
+  $ALT_FULLSTOP_UTF8_RE = qr/$dot_bytes/so;
+}
+
+# http://en.wikipedia.org/wiki/Whitespace_character
+# Unicode property \p{Space} plus a 'ZERO WIDTH SPACE' U+200B
+sub InIDNAWhitespace {
+  return <<'END';
++\p{Space}
++200B
+END
+}
+
+sub InIDNAFullStop {
+  return <<'END';
+002E
+2024
+3002
+FE52
+FF0E
+FF61
+END
+}
+
+# http://unicode.org/faq/idn.html IDNA2008, perlunicode(1) man page
+sub InIDNA2008 {
+  return <<'END';
+!utf8::Changes_When_NFKC_Casefolded
+-utf8::c
+-utf8::z
+-utf8::s
+-utf8::p
+-utf8::nl
+-utf8::no
+-utf8::me
+-utf8::HST=L
+-utf8::HST=V
+-utf8::HST=V
+-utf8::block=Combining_Diacritical_Marks_For_Symbols
+-utf8::block=Ancient_Greek_Musical_Notation
+-utf8::block=Musical_Symbols
+-0640
+-07FA
+-302E
+-302F
+-3031 3035
+-303B
++00B7
++0375
++05F3
++05F4
++30FB
++002D
++06FD
++06FE
++0F0B
++3007
++00DF
++03C2
++utf8::JoinControl
+END
+}
+
 # ArchiveIterator
 
 # if AI doesn't read in the message in the first pass to see if the received
Index: lib/Mail/SpamAssassin/PerMsgStatus.pm
===================================================================
--- lib/Mail/SpamAssassin/PerMsgStatus.pm	(revision 1686884)
+++ lib/Mail/SpamAssassin/PerMsgStatus.pm	(working copy)
@@ -2109,14 +2112,14 @@
 
   # knownscheme regexp looks for either a https?: or ftp: scheme, or www\d*\. or ftp\. prefix, i.e., likely to start a URL
   # schemeless regexp looks for a valid TLD at the end of what may be a FQDN, followed by optional ., optional :portnum, optional /rest_of_uri
-  my $urischemeless = qr/[a-z\d][a-z\d._-]{0,251}\.${tldsRE}\.?(?::\d{1,5})?(?:\/[^$tbirdenddelim]{1,251})?/io;
-  my $uriknownscheme = qr/(?:(?:(?:(?:https?)|(?:ftp)):(?:\/\/)?)|(?:(?:www\d{0,2}|ftp)\.))[^$tbirdenddelim]{1,251}/io;
-  my $urimailscheme = qr/(?:mailto:)?[^$tbirdenddelimplusat]{1,251}@[^$tbirdenddelimemail]{1,251}/io;
+  my $urischemeless = qr/[a-z\d][a-z\d._-]{0,251}\.${tldsRE}\.?(?::\d{1,5})?(?:\/[^$tbirdenddelim]{1,251}?)?/io;
+  my $uriknownscheme = qr/(?:(?:(?:(?:https?)|(?:ftp)):(?:\/\/)?)|(?:(?:www\d{0,2}|ftp)\.))[^$tbirdenddelim]{1,251}?/io;
+  my $urimailscheme = qr/(?:mailto:)?[^$tbirdenddelimplusat]{1,251}@[^$tbirdenddelimemail]{1,251}?/io;
 
   $self->{tbirdurire} = qr/(?:\b|(?<=$iso2022shift)|(?<=[$tbirdstartdelim]))
-                        (?:(?:($uriknownscheme)(?=(?:[$tbirdenddelim]|\z))) |
-                        (?:($urimailscheme)(?=(?:[$tbirdenddelimemail]|\z))) |
-                        (?:\b($urischemeless)(?=(?:[$tbirdenddelim]|\z))))/xo;
+                        (?:(?:($uriknownscheme)(?=(?:[$tbirdenddelim]|$WHITESPACE_UTF8_RE|\z))) |
+                        (?:($urimailscheme)(?=(?:[$tbirdenddelimemail]|$WHITESPACE_UTF8_RE|\z))) |
+                        (?:\b($urischemeless)(?=(?:[$tbirdenddelim]|$WHITESPACE_UTF8_RE|\z))))/xo;
 
   return $self->{tbirdurire};
 }
Index: lib/Mail/SpamAssassin/Plugin/HeaderEval.pm
===================================================================
--- lib/Mail/SpamAssassin/Plugin/HeaderEval.pm	(revision 1686884)
+++ lib/Mail/SpamAssassin/Plugin/HeaderEval.pm	(working copy)
@@ -25,6 +25,7 @@
 
 use Mail::SpamAssassin::Plugin;
 use Mail::SpamAssassin::Locales;
+use Mail::SpamAssassin::Util qw(get_my_locales parse_rfc822_date idn_to_ascii);
 use Mail::SpamAssassin::Logger;
 use Mail::SpamAssassin::Constants qw(:sa :ip);
 
@@ -1037,6 +1038,7 @@
 
   if ($to =~ /^([^@]+)@(.+)$/) {
     my($user,$dom) = ($1,$2);
+    $dom = idn_to_ascii($dom);
     $dom = $self->{main}->{registryboundaries}->trim_domain($dom);
     return unless
         ($self->{main}->{registryboundaries}->is_domain_valid($dom));
Index: lib/Mail/SpamAssassin/Util.pm
===================================================================
--- lib/Mail/SpamAssassin/Util.pm	(revision 1686884)
+++ lib/Mail/SpamAssassin/Util.pm	(working copy)
@@ -62,10 +63,12 @@
   @EXPORT_OK = qw(&local_tz &base64_decode &untaint_var &untaint_file_path
                   &exit_status_str &proc_status_ok &am_running_on_windows
                   &reverse_ip_address &decode_dns_question_entry
-                  &secure_tmpfile &secure_tmpdir &uri_list_canonicalize);
+                  &secure_tmpfile &secure_tmpdir &uri_list_canonicalize
+                  &get_my_locales &parse_rfc822_date &idn_to_ascii);
 }
 
 use Mail::SpamAssassin;
+use Mail::SpamAssassin::Constants qw(:sa);
 
 use Config;
 use IO::Handle;
@@ -74,6 +77,7 @@
 use Time::Local;
 use Sys::Hostname (); # don't import hostname() into this namespace!
 use NetAddr::IP 4.000;
+use Scalar::Util qw(tainted);
 use Fcntl;
 use Errno qw(ENOENT EACCES EEXIST);
 use POSIX qw(:sys_wait_h WIFEXITED WIFSIGNALED WIFSTOPPED WEXITSTATUS
@@ -96,6 +100,22 @@
 
 ###########################################################################
 
+our $enc_utf8;
+BEGIN {
+  eval { require Encode }
+    and do { $enc_utf8 = Encode::find_encoding('UTF-8') }
+};
+
+our $have_libidn;
+BEGIN {
+  eval { require Net::LibIDN } and do { $have_libidn = 1 };
+}
+
+$have_libidn or warn "INFO: module Net::LibIDN not available,\n".
+  "  internationalized domain names with U-labels will not be recognized!\n";
+
+###########################################################################
+
 # find an executable in the current $PATH (or whatever for that platform)
 {
   # Show the PATH we're going to explore only once.
@@ -338,6 +358,109 @@
 
 ###########################################################################
 
+# returns true if the provided string of octets represents a syntactically
+# valid UTF-8 string, otherwise a false is returned
+#
+sub is_valid_utf_8($) {
+# my $octets = $_[0];
+  return undef if !defined $_[0];
+  #
+  # RFC 6532: UTF8-non-ascii = UTF8-2 / UTF8-3 / UTF8-4
+  # RFC 3629 section 4: Syntax of UTF-8 Byte Sequences
+  #   UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
+  #   UTF8-1      = %x00-7F
+  #   UTF8-2      = %xC2-DF UTF8-tail
+  #   UTF8-3      = %xE0 %xA0-BF UTF8-tail /
+  #                 %xE1-EC 2( UTF8-tail ) /
+  #                 %xED %x80-9F UTF8-tail /
+  #                   # U+D800..U+DFFF are utf16 surrogates, not legal utf8
+  #                 %xEE-EF 2( UTF8-tail )
+  #   UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) /
+  #                 %xF1-F3 3( UTF8-tail ) /
+  #                 %xF4 %x80-8F 2( UTF8-tail )
+  #   UTF8-tail   = %x80-BF
+  #
+  # loose variant:
+  #   [\x00-\x7F] | [\xC0-\xDF][\x80-\xBF] |
+  #   [\xE0-\xEF][\x80-\xBF]{2} | [\xF0-\xF4][\x80-\xBF]{3}
+  #
+  $_[0] =~ /^ (?: [\x00-\x7F] |
+                  [\xC2-\xDF] [\x80-\xBF] |
+                  \xE0 [\xA0-\xBF] [\x80-\xBF] |
+                  [\xE1-\xEC] [\x80-\xBF]{2} |
+                  \xED [\x80-\x9F] [\x80-\xBF] |
+                  [\xEE-\xEF] [\x80-\xBF]{2} |
+                  \xF0 [\x90-\xBF] [\x80-\xBF]{2} |
+                  [\xF1-\xF3] [\x80-\xBF]{3} |
+                  \xF4 [\x80-\x8F] [\x80-\xBF]{2} )* \z/xs ? 1 : 0;
+}
+
+# Given an international domain name with U-labels (UTF-8 or Unicode chars)
+# converts it to ASCII-compatible encoding (ACE).  If the argument is in
+# ASCII (or is an invalid IDN), returns it lowercased but otherwise unchanged.
+# The result is always in octets (utf8 flag off) even if the argument was in
+# Unicode characters.
+#
+sub idn_to_ascii($) {
+  no bytes;
+  my $s = $_[0];
+  return undef  if !defined $s;
+  # propagate taintedness of the argument, but not its utf8 flag
+  my $t = tainted($s);  # taintedness of the argument
+  $t = untaint_var($t)  if $t;
+  # leave octets unchanged (not necessarily valid UTF-8), encode chars to UTF-8
+  utf8::encode($s)  if utf8::is_utf8($s);
+  if ($s !~ tr/\x00-\x7F//c) {  # is all-ASCII (including IP address literal)
+    $s = lc $s;
+  } elsif (!is_valid_utf_8($s)) {
+    info("util: idn_to_ascii: not valid UTF-8 (%d): /%s/", $t, $s);
+    $s = lc $s;  # garbage-in / garbage-out
+  } else {
+    my $chars;
+    if (eval { $chars = $enc_utf8->decode($s,1|8); 1 }) {
+      $chars =~ s/\p{Mail::SpamAssassin::Constants::InIDNAFullStop}/./gso;
+      local $1;
+      if (lc($chars) =~ /([.\p{Mail::SpamAssassin::Constants::InIDNA2008}]+)/) {
+        $chars = $1;
+        utf8::encode($chars);
+        if ($chars ne $s) {
+          info("util: idn_to_ascii: extracted: /%s/ -> /%s/", $s, $chars);
+          $s = $chars;
+        }
+      }
+    } else {
+      # RFC 3490 (IDNA): Whenever dots are used as label separators, the
+      # following characters MUST be recognized as dots: U+002E (full stop),
+      # U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
+      # U+FF61 (halfwidth ideographic full stop).
+      $s =~ s/$ALT_FULLSTOP_UTF8_RE/./gso;
+      # trim whitespace
+my $ssv = $s;
+      $s =~ s/^$WHITESPACE_UTF8_RE+//so;
+      $s =~ s/$WHITESPACE_UTF8_RE+\z//so;
+if ($ssv ne $s) {
+      info("util: idn_to_ascii: trimmed (%d): /%s/ -> /%s/", $t, $ssv, $s);
+}
+    }
+    if ($have_libidn) {
+      # to ASCII-compatible encoding (ACE), lowercased
+      my $sa = Net::LibIDN::idn_to_ascii($s, 'UTF-8');
+
+if (!defined $sa) {
+  info("util: idn_to_ascii: conversion to ACE failed (%d): /%s/", $t, $s);
+} elsif ($sa ne lc $s) {
+  info("util: idn_to_ascii: converted to ACE (%d): /%s/ -> /%s/", $t, $s, $sa);
+} else {
+  info("util: idn_to_ascii: unchanged (%d): /%s/", $t, $s);
+}
+      $s = $sa  if defined $sa;
+    }
+  }
+  $t ? taint_var($s) : $s;  # propagate taintedness of the argument
+}
+
+###########################################################################
+
 # map process termination status number to an informative string, and
 # append optional mesage (dual-valued errno or a string or a number),
 # returning the resulting string
@@ -1314,20 +1439,10 @@
       # not required
       $rest ||= '';
 
-      # Bug 6751:
-      # RFC 3490 (IDNA): Whenever dots are used as label separators, the
-      #   following characters MUST be recognized as dots: U+002E (full stop),
-      #   U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
-      #   U+FF61 (halfwidth ideographic full stop).
-      # RFC 5895: [...] the IDEOGRAPHIC FULL STOP character (U+3002)
-      #   can be mapped to the FULL STOP before label separation occurs.
-      #   [...] Only the IDEOGRAPHIC FULL STOP character (U+3002) is added in
-      #   this mapping because the authors have not fully investigated [...]
-      # Adding also 'SMALL FULL STOP' (U+FE52) as seen in the wild.
-      # Parhaps also the 'ONE DOT LEADER' (U+2024).
-      if ($host =~ s{(?: \xE3\x80\x82 | \xEF\xBC\x8E | \xEF\xBD\xA1 |
-                         \xEF\xB9\x92 | \xE2\x80\xA4 )}{.}xgs) {
-        push(@nuris, join ('', $proto, $host, $rest));
+      my $nhost = idn_to_ascii($host);
+      if (defined $nhost && $nhost ne lc $host) {
+        push(@nuris, join('', $proto, $nhost, $rest));
+        $host = $nhost;
       }
 
       # bug 4146: deal with non-US ASCII 7-bit chars in the host portion
@@ -1334,7 +1449,8 @@
       # of the URI according to RFC 1738 that's invalid, and the tested
       # browsers (Firefox, IE) remove them before usage...
       if ($host =~ tr/\000-\040\200-\377//d) {
-        push(@nuris, join ('', $proto, $host, $rest));
+        push(@nuris, join ('', $proto, $host, $rest))
+          if $host =~ /[^\x00-\x2F\x3A-\x40\x5B-\x60\x7B-\x7F\x80]/;
       }
 
       # deal with http redirectors.  strip off one level of redirector
@@ -1375,7 +1491,8 @@
       # the host portion should end in some form of alpha-numeric, strip off
       # the rest.
       if ($host =~ s/[^0-9A-Za-z]+$//) {
-        push(@nuris, join ('', $proto, $host, $rest));
+        push(@nuris, join ('', $proto, $host, $rest))
+          if $host =~ /[^\x00-\x2F\x3A-\x40\x5B-\x60\x7B-\x7F\x80]/;
       }
 
       ########################