Index: lib/Mail/SpamAssassin/HTML.pm
===================================================================
--- lib/Mail/SpamAssassin/HTML.pm (revision 1659637)
+++ lib/Mail/SpamAssassin/HTML.pm (working copy)
@@ -24,6 +24,9 @@
use warnings;
use re 'taint';
+require 5.008; # need basic Unicode support for HTML::Parser::utf8_mode
+# require 5.008008; # Bug 3787; [perl #37950]: Malformed UTF-8 character ...
+
use HTML::Parser 3.43 ();
use Mail::SpamAssassin::Logger;
use Mail::SpamAssassin::Constants qw(:sa);
@@ -86,7 +89,7 @@
$ok_attributes{div}{$_} = 1 for qw( style );
sub new {
- my ($class) = @_;
+ my ($class, $character_semantics_input, $character_semantics_output) = @_;
my $self = $class->SUPER::new(
api_version => 3,
handlers => [
@@ -99,7 +102,9 @@
declaration => ["html_declaration", "self,text"],
],
marked_sections => 1);
-
+ $self->{SA_character_semantics_input} = $character_semantics_input;
+ $self->{SA_encode_results} =
+ $character_semantics_input && !$character_semantics_output;
$self;
}
@@ -125,7 +130,7 @@
my @uri;
- # add the canonified version of each uri to the detail list
+ # add the canonicalized version of each uri to the detail list
if (defined $self->{uri}) {
@uri = keys %{$self->{uri}};
}
@@ -232,8 +237,10 @@
# NOTE: HTML::Parser can cope with: , with space>, so we
# don't need to fix them here.
- # HTML::Parser converts into a question mark ("?") for some
- # reason, so convert them to spaces. Confirmed in 3.31, at least.
+ # # (outdated claim) HTML::Parser converts into a question mark ("?")
+ # # for some reason, so convert them to spaces. Confirmed in 3.31, at least.
+ # ... Actually it doesn't, it is correctly coverted into Unicode NBSP,
+ # nevertheless it does not hurt to treat it as a space.
$text =~ s/ / /g;
# bug 4695: we want "
" to be treated the same as "
", and
@@ -240,16 +247,17 @@
# the HTML::Parser API won't do it for us
$text =~ s/<(\w+)\s*\/>/<$1>/gi;
- # Ignore stupid warning that can't be suppressed: 'Parsing of
- # undecoded UTF-8 will give garbage when decoding entities at ..' (bug 4046)
- {
- local $SIG{__WARN__} = sub {
- warn @_ unless (defined $_[0] && $_[0] =~ /^Parsing of undecoded UTF-/);
- };
-
- $self->SUPER::parse($text);
+ if (!$self->UNIVERSAL::can('utf8_mode')) {
+ # utf8_mode is cleared by default, only warn if it would need to be set
+ warn "message: cannot set utf8_mode, module HTML::Parser is too old\n"
+ if !$self->{SA_character_semantics_input};
+ } else {
+ $self->SUPER::utf8_mode($self->{SA_character_semantics_input} ? 0 : 1);
+ dbg("message: HTML::Parser utf8_mode %s",
+ $self->SUPER::utf8_mode ? "on (assumed UTF-8 octets)"
+ : "off (default, assumed Unicode characters)");
}
-
+ $self->SUPER::parse($text);
$self->SUPER::eof;
return $self->{text};
@@ -257,6 +265,7 @@
sub html_tag {
my ($self, $tag, $attr, $num) = @_;
+ utf8::encode($tag) if $self->{SA_encode_results};
my $maybe_namespace = ($tag =~ m@^(?:o|st\d):[\w-]+/?$@);
@@ -276,15 +285,15 @@
# ignore non-elements
if (exists $elements{$tag} || exists $tricks{$tag}) {
- text_style(@_) if exists $elements_text_style{$tag};
+ $self->text_style($tag, $attr, $num) if exists $elements_text_style{$tag};
# bug 5009: things like
and
both need dealing with - html_whitespace(@_) if exists $elements_whitespace{$tag}; + $self->html_whitespace($tag) if exists $elements_whitespace{$tag}; # start tags if ($num == 1) { - html_uri(@_) if exists $elements_uri{$tag}; - html_tests(@_); + $self->html_uri($tag, $attr) if exists $elements_uri{$tag}; + $self->html_tests($tag, $attr, $num); } # end tags else { @@ -315,13 +324,12 @@ my ($self, $type, $uri) = @_; $uri = $self->canon_uri($uri); + utf8::encode($uri) if $self->{SA_encode_results}; my $target = target_uri($self->{base_href} || "", $uri); # skip things like