Index: lib/Mail/SpamAssassin/HTML.pm =================================================================== --- lib/Mail/SpamAssassin/HTML.pm (revision 1659637) +++ lib/Mail/SpamAssassin/HTML.pm (working copy) @@ -24,6 +24,9 @@ use warnings; use re 'taint'; +require 5.008; # need basic Unicode support for HTML::Parser::utf8_mode +# require 5.008008; # Bug 3787; [perl #37950]: Malformed UTF-8 character ... + use HTML::Parser 3.43 (); use Mail::SpamAssassin::Logger; use Mail::SpamAssassin::Constants qw(:sa); @@ -86,7 +89,7 @@ $ok_attributes{div}{$_} = 1 for qw( style ); sub new { - my ($class) = @_; + my ($class, $character_semantics_input, $character_semantics_output) = @_; my $self = $class->SUPER::new( api_version => 3, handlers => [ @@ -99,7 +102,9 @@ declaration => ["html_declaration", "self,text"], ], marked_sections => 1); - + $self->{SA_character_semantics_input} = $character_semantics_input; + $self->{SA_encode_results} = + $character_semantics_input && !$character_semantics_output; $self; } @@ -125,7 +130,7 @@ my @uri; - # add the canonified version of each uri to the detail list + # add the canonicalized version of each uri to the detail list if (defined $self->{uri}) { @uri = keys %{$self->{uri}}; } @@ -232,8 +237,10 @@ # NOTE: HTML::Parser can cope with: , , so we # don't need to fix them here. - # HTML::Parser converts   into a question mark ("?") for some - # reason, so convert them to spaces. Confirmed in 3.31, at least. + # # (outdated claim) HTML::Parser converts   into a question mark ("?") + # # for some reason, so convert them to spaces. Confirmed in 3.31, at least. + # ... Actually it doesn't, it is correctly coverted into Unicode NBSP, + # nevertheless it does not hurt to treat it as a space. $text =~ s/ / /g; # bug 4695: we want "
" to be treated the same as "
", and @@ -240,16 +247,17 @@ # the HTML::Parser API won't do it for us $text =~ s/<(\w+)\s*\/>/<$1>/gi; - # Ignore stupid warning that can't be suppressed: 'Parsing of - # undecoded UTF-8 will give garbage when decoding entities at ..' (bug 4046) - { - local $SIG{__WARN__} = sub { - warn @_ unless (defined $_[0] && $_[0] =~ /^Parsing of undecoded UTF-/); - }; - - $self->SUPER::parse($text); + if (!$self->UNIVERSAL::can('utf8_mode')) { + # utf8_mode is cleared by default, only warn if it would need to be set + warn "message: cannot set utf8_mode, module HTML::Parser is too old\n" + if !$self->{SA_character_semantics_input}; + } else { + $self->SUPER::utf8_mode($self->{SA_character_semantics_input} ? 0 : 1); + dbg("message: HTML::Parser utf8_mode %s", + $self->SUPER::utf8_mode ? "on (assumed UTF-8 octets)" + : "off (default, assumed Unicode characters)"); } - + $self->SUPER::parse($text); $self->SUPER::eof; return $self->{text}; @@ -257,6 +265,7 @@ sub html_tag { my ($self, $tag, $attr, $num) = @_; + utf8::encode($tag) if $self->{SA_encode_results}; my $maybe_namespace = ($tag =~ m@^(?:o|st\d):[\w-]+/?$@); @@ -276,15 +285,15 @@ # ignore non-elements if (exists $elements{$tag} || exists $tricks{$tag}) { - text_style(@_) if exists $elements_text_style{$tag}; + $self->text_style($tag, $attr, $num) if exists $elements_text_style{$tag}; # bug 5009: things like

and

both need dealing with - html_whitespace(@_) if exists $elements_whitespace{$tag}; + $self->html_whitespace($tag) if exists $elements_whitespace{$tag}; # start tags if ($num == 1) { - html_uri(@_) if exists $elements_uri{$tag}; - html_tests(@_); + $self->html_uri($tag, $attr) if exists $elements_uri{$tag}; + $self->html_tests($tag, $attr, $num); } # end tags else { @@ -315,13 +324,12 @@ my ($self, $type, $uri) = @_; $uri = $self->canon_uri($uri); + utf8::encode($uri) if $self->{SA_encode_results}; my $target = target_uri($self->{base_href} || "", $uri); # skip things like