Index: lib/Mail/SpamAssassin/HTML.pm
===================================================================
--- lib/Mail/SpamAssassin/HTML.pm	(revision 1659637)
+++ lib/Mail/SpamAssassin/HTML.pm	(working copy)
@@ -24,6 +24,9 @@
 use warnings;
 use re 'taint';
 
+require 5.008;     # need basic Unicode support for HTML::Parser::utf8_mode
+# require 5.008008;  # Bug 3787; [perl #37950]: Malformed UTF-8 character ...
+
 use HTML::Parser 3.43 ();
 use Mail::SpamAssassin::Logger;
 use Mail::SpamAssassin::Constants qw(:sa);
@@ -86,7 +89,7 @@
 $ok_attributes{div}{$_} = 1 for qw( style );
 
 sub new {
-  my ($class) = @_;
+  my ($class, $character_semantics_input, $character_semantics_output) = @_;
   my $self = $class->SUPER::new(
 		api_version => 3,
 		handlers => [
@@ -99,7 +102,9 @@
 			declaration => ["html_declaration", "self,text"],
 		],
 		marked_sections => 1);
-
+  $self->{SA_character_semantics_input} = $character_semantics_input;
+  $self->{SA_encode_results} =
+    $character_semantics_input && !$character_semantics_output;
   $self;
 }
 
@@ -125,7 +130,7 @@
 
   my @uri;
 
-  # add the canonified version of each uri to the detail list
+  # add the canonicalized version of each uri to the detail list
   if (defined $self->{uri}) {
     @uri = keys %{$self->{uri}};
   }
@@ -232,8 +237,10 @@
   # NOTE: HTML::Parser can cope with: <?xml pis>, <? with space>, so we
   # don't need to fix them here.
 
-  # HTML::Parser converts &nbsp; into a question mark ("?") for some
-  # reason, so convert them to spaces.  Confirmed in 3.31, at least.
+  # # (outdated claim) HTML::Parser converts &nbsp; into a question mark ("?")
+  # # for some reason, so convert them to spaces.  Confirmed in 3.31, at least.
+  # ... Actually it doesn't, it is correctly converted into Unicode NBSP,
+  # nevertheless it does not hurt to treat it as a space.
   $text =~ s/&nbsp;/ /g;
 
   # bug 4695: we want "<br/>" to be treated the same as "<br>", and
@@ -240,16 +247,17 @@
   # the HTML::Parser API won't do it for us
   $text =~ s/<(\w+)\s*\/>/<$1>/gi;
 
-  # Ignore stupid warning that can't be suppressed: 'Parsing of
-  # undecoded UTF-8 will give garbage when decoding entities at ..' (bug 4046)
-  {
-    local $SIG{__WARN__} = sub {
-      warn @_ unless (defined $_[0] && $_[0] =~ /^Parsing of undecoded UTF-/);
-    };
-
-    $self->SUPER::parse($text);
+  if (!$self->UNIVERSAL::can('utf8_mode')) {
+    # utf8_mode is cleared by default, only warn if it would need to be set
+    warn "message: cannot set utf8_mode, module HTML::Parser is too old\n"
+      if !$self->{SA_character_semantics_input};
+  } else {
+    $self->SUPER::utf8_mode($self->{SA_character_semantics_input} ? 0 : 1);
+    dbg("message: HTML::Parser utf8_mode %s",
+        $self->SUPER::utf8_mode ? "on (assumed UTF-8 octets)"
+                                : "off (default, assumed Unicode characters)");
   }
-
+  $self->SUPER::parse($text);
   $self->SUPER::eof;
 
   return $self->{text};
@@ -257,6 +265,7 @@
 
 sub html_tag {
   my ($self, $tag, $attr, $num) = @_;
+  utf8::encode($tag) if $self->{SA_encode_results};
 
   my $maybe_namespace = ($tag =~ m@^(?:o|st\d):[\w-]+/?$@);
 
@@ -276,15 +285,15 @@
 
   # ignore non-elements
   if (exists $elements{$tag} || exists $tricks{$tag}) {
-    text_style(@_) if exists $elements_text_style{$tag};
+    $self->text_style($tag, $attr, $num) if exists $elements_text_style{$tag};
 
     # bug 5009: things like <p> and </p> both need dealing with
-    html_whitespace(@_) if exists $elements_whitespace{$tag};
+    $self->html_whitespace($tag) if exists $elements_whitespace{$tag};
 
     # start tags
     if ($num == 1) {
-      html_uri(@_) if exists $elements_uri{$tag};
-      html_tests(@_);
+      $self->html_uri($tag, $attr) if exists $elements_uri{$tag};
+      $self->html_tests($tag, $attr, $num);
     }
     # end tags
     else {
@@ -315,13 +324,12 @@
   my ($self, $type, $uri) = @_;
 
   $uri = $self->canon_uri($uri);
+  utf8::encode($uri) if $self->{SA_encode_results};
 
   my $target = target_uri($self->{base_href} || "", $uri);
 
   # skip things like <iframe src="" ...>
-  if (length $uri) {
-    $self->{uri}->{$uri}->{types}->{$type} = 1;
-  }
+  $self->{uri}->{$uri}->{types}->{$type} = 1  if $uri ne '';
 }
 
 sub canon_uri {
@@ -382,6 +390,7 @@
 
 	# Make sure it ends in a slash
 	$uri .= "/" unless $uri =~ m@/$@;
+        utf8::encode($uri) if $self->{SA_encode_results};
 	$self->{base_href} = $uri;
       }
     }
@@ -604,7 +613,9 @@
     }
   }
   if ($tag eq "img" && exists $self->{inside}{a} && $self->{inside}{a} > 0) {
-    $self->{uri}->{$self->{anchor_last}}->{anchor_text}->[-1] .= "<img>\n";
+    my $uri = $self->{anchor_last};
+    utf8::encode($uri) if $self->{SA_encode_results};
+    $self->{uri}->{$uri}->{anchor_text}->[-1] .= "<img>\n";
     $self->{anchor}->[-1] .= "<img>\n";
   }
 
@@ -639,8 +650,10 @@
 
   # special text delimiters - <a> and <title>
   if ($tag eq "a") {
-    $self->{anchor_last} = (exists $attr->{href} ? $self->canon_uri($attr->{href}) : "");
-    push(@{$self->{uri}->{$self->{anchor_last}}->{anchor_text}}, '');
+    my $uri = $self->{anchor_last} =
+      (exists $attr->{href} ? $self->canon_uri($attr->{href}) : "");
+    utf8::encode($uri) if $self->{SA_encode_results};
+    push(@{$self->{uri}->{$uri}->{anchor_text}}, '');
     push(@{$self->{anchor}}, '');
   }
   if ($tag eq "title") {
@@ -681,7 +694,8 @@
     }
   }
   else {
-    $text =~ s/[ \t\n\r\f\x0b\xa0]+/ /g;
+    # NBSP:  UTF-8: C2 A0, ISO-8859-*: A0
+    $text =~ s/[ \t\n\r\f\x0b]+|\xc2\xa0/ /gs;
     # trim leading whitespace if previous element was whitespace 
     # and current element is not invisible
     if (@{ $self->{text} } && !$display{invisible} &&
@@ -701,6 +715,7 @@
 
 sub html_text {
   my ($self, $text) = @_;
+  utf8::encode($text) if $self->{SA_encode_results};
 
   # text that is not part of body
   if (exists $self->{inside}{script} && $self->{inside}{script} > 0)
@@ -715,7 +730,9 @@
   # text that is part of body and also stored separately
   if (exists $self->{inside}{a} && $self->{inside}{a} > 0) {
     # this doesn't worry about nested anchors
-    $self->{uri}->{$self->{anchor_last}}->{anchor_text}->[-1] .= $text;
+    my $uri = $self->{anchor_last};
+    utf8::encode($uri) if $self->{SA_encode_results};
+    $self->{uri}->{$uri}->{anchor_text}->[-1] .= $text;
     $self->{anchor}->[-1] .= $text;
   }
   if (exists $self->{inside}{title} && $self->{inside}{title} > 0) {
@@ -723,7 +740,9 @@
   }
 
   my $invisible_for_bayes = 0;
-  if ($text =~ /[^ \t\n\r\f\x0b\xa0]/) {
+
+  # NBSP:  UTF-8: C2 A0, ISO-8859-*: A0
+  if ($text !~ /^(?:[ \t\n\r\f\x0b]|\xc2\xa0)*\z/s) {
     $invisible_for_bayes = $self->html_font_invisible($text);
   }
 
@@ -758,6 +777,7 @@
 # note: $text includes <!-- and -->
 sub html_comment {
   my ($self, $text) = @_;
+  utf8::encode($text) if $self->{SA_encode_results};
 
   push @{ $self->{comment} }, $text;
 }
@@ -764,6 +784,7 @@
 
 sub html_declaration {
   my ($self, $text) = @_;
+  utf8::encode($text) if $self->{SA_encode_results};
 
   if ($text =~ /^<!doctype/i) {
     my $tag = "!doctype";
Index: lib/Mail/SpamAssassin/Message/Node.pm
===================================================================
--- lib/Mail/SpamAssassin/Message/Node.pm	(revision 1659637)
+++ lib/Mail/SpamAssassin/Message/Node.pm	(working copy)
@@ -581,19 +581,39 @@
     # text/x-aol is ignored here, but looks like text/html ...
     return(undef,undef) unless ( $self->{'type'} =~ /^text\/(?:plain|html)$/i );
 
-    my $text = $self->_normalize($self->decode(), $self->{charset});
-    my $raw = length($text);
+    my $text = $self->decode;  # QP and Base64 decoding
 
     # render text/html always, or any other text|text/plain part as text/html
     # based on a heuristic which simulates a certain common mail client
-    if ($raw > 0 && ($self->{'type'} =~ m@^text/html$@i ||
-		     ($self->{'type'} =~ m@^text/plain$@i &&
-		      _html_render(substr($text, 0, 23)))))
+    if ($text ne '' && ($self->{'type'} =~ m{^text/html$}i ||
+		        ($self->{'type'} =~ m{^text/plain$}i &&
+		         _html_render(substr($text, 0, 23)))))
     {
       $self->{rendered_type} = 'text/html';
 
-      my $html = Mail::SpamAssassin::HTML->new();	# object
-      $html->parse($text);				# parse+render text
+      # will input text to HTML::Parser be provided as Unicode characters?
+      my $character_semantics = 0;
+      if ($self->{normalize} && $enc_utf8) {  # charset decoding requested
+        # Provide input to HTML::Parser as Unicode characters
+        # which avoids a HTML::Parser bug in utf8_mode
+        #   https://rt.cpan.org/Public/Bug/Display.html?id=99755
+        # Avoid unnecessary step of encoding->decoding by telling
+        # subroutine _normalize() to return Unicode text.  See Bug 7133
+        #
+        $character_semantics = 1;
+        $text = $self->_normalize($text, $self->{charset}, 1);
+      } elsif (!defined $self->{charset} ||
+               $self->{charset} =~ /^(?:US-ASCII|UTF-8)\z/i) {
+        # With some luck input can be interpreted as UTF-8, do not warn.
+        # It is still possible to hit the HTML::Parser utf8_mode bug however.
+      } else {
+        dbg("message: 'normalize_charset' is off, encoding will likely ".
+            "be misinterpreted; declared charset: %s", $self->{charset});
+      }
+      # the 0 requires decoded HTML results to be in bytes (not characters)
+      my $html = Mail::SpamAssassin::HTML->new($character_semantics,0); # object
+
+      $html->parse($text);  # parse+render text
       $self->{rendered} = $html->get_rendered_text();
       $self->{visible_rendered} = $html->get_rendered_text(invisible => 0);
       $self->{invisible_rendered} = $html->get_rendered_text(invisible => 1);
@@ -607,10 +627,16 @@
       my $space = ($rt =~ tr/ \t\n\r\x0b\xa0/ \t\n\r\x0b\xa0/);
       $r->{html_length} = length($rt);
 
+      my $text_len = length($text);
       $r->{non_space_len} = $r->{html_length} - $space;
-      $r->{ratio} = ($raw - $r->{html_length}) / $raw;
+      $r->{ratio} = ($text_len - $r->{html_length}) / $text_len;
     }
-    else {
+
+    else {  # plain text
+      if ($self->{normalize} && $enc_utf8) {
+        # request transcoded result as UTF-8 octets!
+        $text = $self->_normalize($text, $self->{charset}, 0);
+      }
       $self->{rendered_type} = $self->{type};
       $self->{rendered} = $self->{'visible_rendered'} = $text;
       $self->{'invisible_rendered'} = '';
@@ -732,7 +758,7 @@
     # not possible since the input has already been limited to 'B' and 'Q'
     die "message: unknown encoding type '$cte' in RFC2047 header";
   }
-  return $self->_normalize($data, $encoding);
+  return $self->_normalize($data, $encoding, 0);  # transcode to UTF-8 octets
 }
 
 # Decode base64 and quoted-printable in headers according to RFC2047.
@@ -753,7 +779,7 @@
     # Bug 6945: some header fields must not be processed for MIME encoding
 
   } else {
-    local($1,$2,$3,$4);
+    local($1,$2,$3);
 
     # Multiple encoded sections must ignore the interim whitespace.
     # To avoid possible FPs with (\s+(?==\?))?, look for the whole RE
Index: lib/Mail/SpamAssassin/Message.pm
===================================================================
--- lib/Mail/SpamAssassin/Message.pm	(revision 1659637)
+++ lib/Mail/SpamAssassin/Message.pm	(working copy)
@@ -1142,7 +1142,8 @@
 
   # whitespace handling (warning: small changes have large effects!)
   $text =~ s/\n+\s*\n+/\f/gs;		# double newlines => form feed
-  $text =~ tr/ \t\n\r\x0b\xa0/ /s;	# whitespace (incl. VT, NBSP) => space
+# $text =~ tr/ \t\n\r\x0b\xa0/ /s;	# whitespace (incl. VT, NBSP) => space
+  $text =~ tr/ \t\n\r\x0b/ /s;		# whitespace (incl. VT) => space
   $text =~ tr/\f/\n/;			# form feeds => newline
 
   my @textary = split_into_array_of_short_lines($text);
Index: t/html_utf8.t
===================================================================
--- t/html_utf8.t	(revision 1659637)
+++ t/html_utf8.t	(working copy)
@@ -21,7 +21,8 @@
 
 tstlocalrules ('
 body OPPORTUNITY	/OPPORTUNITY/
-body QUOTE_YOUR /\x{201c}Your/
+# body QUOTE_YOUR /\x{201c}Your/
+body QUOTE_YOUR /\xE2\x80\x9CYour/
 ');
 sarun ("-L -t < data/spam/009", \&patterns_run_cb);
 ok_all_patterns();