Attachment #5278 for bug #7133

View | Details | Raw Unified | Return to bug 7133
Collapse All | Expand All

Lines 24-29 Link Here

(-)lib/Mail/SpamAssassin/HTML.pm (-27 / +48 lines)
24	use warnings;	24	use warnings;
25	use re 'taint';	25	use re 'taint';
26		26
		27	require 5.008; # need basic Unicode support for HTML::Parser::utf8_mode
		28	# require 5.008008; # Bug 3787; [perl #37950]: Malformed UTF-8 character ...
		29
27	use HTML::Parser 3.43 ();	30	use HTML::Parser 3.43 ();
28	use Mail::SpamAssassin::Logger;	31	use Mail::SpamAssassin::Logger;
29	use Mail::SpamAssassin::Constants qw(:sa);	32	use Mail::SpamAssassin::Constants qw(:sa);
Lines 86-92 Link Here
86	$ok_attributes{div}{$_} = 1 for qw( style );	89	$ok_attributes{div}{$_} = 1 for qw( style );
87		90
88	sub new {	91	sub new {
89	my ($class) = @_;	92	my ($class, $character_semantics_input, $character_semantics_output) = @_;
90	my $self = $class->SUPER::new(	93	my $self = $class->SUPER::new(
91	api_version => 3,	94	api_version => 3,
92	handlers => [	95	handlers => [
Lines 99-105 Link Here
99	declaration => ["html_declaration", "self,text"],	102	declaration => ["html_declaration", "self,text"],
100	],	103	],
101	marked_sections => 1);	104	marked_sections => 1);
102		105	$self->{SA_character_semantics_input} = $character_semantics_input;
		106	$self->{SA_encode_results} =
		107	$character_semantics_input && !$character_semantics_output;
103	$self;	108	$self;
104	}	109	}
105		110
Lines 125-131 Link Here
125		130
126	my @uri;	131	my @uri;
127		132
128	# add the canonified version of each uri to the detail list	133	# add the canonicalized version of each uri to the detail list
129	if (defined $self->{uri}) {	134	if (defined $self->{uri}) {
130	@uri = keys %{$self->{uri}};	135	@uri = keys %{$self->{uri}};
131	}	136	}
Lines 232-239 Link Here
232	# NOTE: HTML::Parser can cope with: <?xml pis>, <? with space>, so we	237	# NOTE: HTML::Parser can cope with: <?xml pis>, <? with space>, so we
233	# don't need to fix them here.	238	# don't need to fix them here.
234		239
235	# HTML::Parser converts   into a question mark ("?") for some	240	# # (outdated claim) HTML::Parser converts   into a question mark ("?")
236	# reason, so convert them to spaces. Confirmed in 3.31, at least.	241	# # for some reason, so convert them to spaces. Confirmed in 3.31, at least.
		242	# ... Actually it doesn't, it is correctly coverted into Unicode NBSP,
		243	# nevertheless it does not hurt to treat it as a space.
237	$text =~ s/ / /g;	244	$text =~ s/ / /g;
238		245
239	# bug 4695: we want "<br/>" to be treated the same as "<br>", and	246	# bug 4695: we want "<br/>" to be treated the same as "<br>", and
Lines 240-255 Link Here
240	# the HTML::Parser API won't do it for us	247	# the HTML::Parser API won't do it for us
241	$text =~ s/<(\w+)\s*\/>/<$1>/gi;	248	$text =~ s/<(\w+)\s*\/>/<$1>/gi;
242		249
243	# Ignore stupid warning that can't be suppressed: 'Parsing of	250	if (!$self->UNIVERSAL::can('utf8_mode')) {
244	# undecoded UTF-8 will give garbage when decoding entities at ..' (bug 4046)	251	# utf8_mode is cleared by default, only warn if it would need to be set
245	{	252	warn "message: cannot set utf8_mode, module HTML::Parser is too old\n"
246	local $SIG{__WARN__} = sub {	253	if !$self->{SA_character_semantics_input};
247	warn @_ unless (defined $_[0] && $_[0] =~ /^Parsing of undecoded UTF-/);	254	} else {
248	};	255	$self->SUPER::utf8_mode($self->{SA_character_semantics_input} ? 0 : 1);
249		256	dbg("message: HTML::Parser utf8_mode %s",
250	$self->SUPER::parse($text);	257	$self->SUPER::utf8_mode ? "on (assumed UTF-8 octets)"
		258	: "off (default, assumed Unicode characters)");
251	}	259	}
252		260	$self->SUPER::parse($text);
253	$self->SUPER::eof;	261	$self->SUPER::eof;
254		262
255	return $self->{text};	263	return $self->{text};
Lines 257-262 Link Here
257		265
258	sub html_tag {	266	sub html_tag {
259	my ($self, $tag, $attr, $num) = @_;	267	my ($self, $tag, $attr, $num) = @_;
		268	utf8::encode($tag) if $self->{SA_encode_results};
260		269
261	my $maybe_namespace = ($tag =~ m@^(?:o\|st\d):[\w-]+/?$@);	270	my $maybe_namespace = ($tag =~ m@^(?:o\|st\d):[\w-]+/?$@);
262		271
Lines 276-290 Link Here
276		285
277	# ignore non-elements	286	# ignore non-elements
278	if (exists $elements{$tag} \|\| exists $tricks{$tag}) {	287	if (exists $elements{$tag} \|\| exists $tricks{$tag}) {
279	text_style(@_) if exists $elements_text_style{$tag};	288	$self->text_style($tag, $attr, $num) if exists $elements_text_style{$tag};
280		289
281	# bug 5009: things like <p> and </p> both need dealing with	290	# bug 5009: things like <p> and </p> both need dealing with
282	html_whitespace(@_) if exists $elements_whitespace{$tag};	291	$self->html_whitespace($tag) if exists $elements_whitespace{$tag};
283		292
284	# start tags	293	# start tags
285	if ($num == 1) {	294	if ($num == 1) {
286	html_uri(@_) if exists $elements_uri{$tag};	295	$self->html_uri($tag, $attr) if exists $elements_uri{$tag};
287	html_tests(@_);	296	$self->html_tests($tag, $attr, $num);
288	}	297	}
289	# end tags	298	# end tags
290	else {	299	else {
Lines 315-327 Link Here
315	my ($self, $type, $uri) = @_;	324	my ($self, $type, $uri) = @_;
316		325
317	$uri = $self->canon_uri($uri);	326	$uri = $self->canon_uri($uri);
		327	utf8::encode($uri) if $self->{SA_encode_results};
318		328
319	my $target = target_uri($self->{base_href} \|\| "", $uri);	329	my $target = target_uri($self->{base_href} \|\| "", $uri);
320		330
321	# skip things like <iframe src="" ...>	331	# skip things like <iframe src="" ...>
322	if (length $uri) {	332	$self->{uri}->{$uri}->{types}->{$type} = 1 if $uri ne '';
323	$self->{uri}->{$uri}->{types}->{$type} = 1;
324	}
325	}	333	}
326		334
327	sub canon_uri {	335	sub canon_uri {
Lines 382-387 Link Here
382		390
383	# Make sure it ends in a slash	391	# Make sure it ends in a slash
384	$uri .= "/" unless $uri =~ m@/$@;	392	$uri .= "/" unless $uri =~ m@/$@;
		393	utf8::encode($uri) if $self->{SA_encode_results};
385	$self->{base_href} = $uri;	394	$self->{base_href} = $uri;
386	}	395	}
387	}	396	}
Lines 604-610 Link Here
604	}	613	}
605	}	614	}
606	if ($tag eq "img" && exists $self->{inside}{a} && $self->{inside}{a} > 0) {	615	if ($tag eq "img" && exists $self->{inside}{a} && $self->{inside}{a} > 0) {
607	$self->{uri}->{$self->{anchor_last}}->{anchor_text}->[-1] .= "<img>\n";	616	my $uri = $self->{anchor_last};
		617	utf8::encode($uri) if $self->{SA_encode_results};
		618	$self->{uri}->{$uri}->{anchor_text}->[-1] .= "<img>\n";
608	$self->{anchor}->[-1] .= "<img>\n";	619	$self->{anchor}->[-1] .= "<img>\n";
609	}	620	}
610		621
Lines 639-646 Link Here
639		650
640	# special text delimiters - <a> and <title>	651	# special text delimiters - <a> and <title>
641	if ($tag eq "a") {	652	if ($tag eq "a") {
642	$self->{anchor_last} = (exists $attr->{href} ? $self->canon_uri($attr->{href}) : "");	653	my $uri = $self->{anchor_last} =
643	push(@{$self->{uri}->{$self->{anchor_last}}->{anchor_text}}, '');	654	(exists $attr->{href} ? $self->canon_uri($attr->{href}) : "");
		655	utf8::encode($uri) if $self->{SA_encode_results};
		656	push(@{$self->{uri}->{$uri}->{anchor_text}}, '');
644	push(@{$self->{anchor}}, '');	657	push(@{$self->{anchor}}, '');
645	}	658	}
646	if ($tag eq "title") {	659	if ($tag eq "title") {
Lines 681-687 Link Here
681	}	694	}
682	}	695	}
683	else {	696	else {
684	$text =~ s/[ \t\n\r\f\x0b\xa0]+/ /g;	697	# NBSP: UTF-8: C2 A0, ISO-8859-*: A0
		698	$text =~ s/[ \t\n\r\f\x0b]+\|\xc2\xa0/ /gs;
685	# trim leading whitespace if previous element was whitespace	699	# trim leading whitespace if previous element was whitespace
686	# and current element is not invisible	700	# and current element is not invisible
687	if (@{ $self->{text} } && !$display{invisible} &&	701	if (@{ $self->{text} } && !$display{invisible} &&
Lines 701-706 Link Here
701		715
702	sub html_text {	716	sub html_text {
703	my ($self, $text) = @_;	717	my ($self, $text) = @_;
		718	utf8::encode($text) if $self->{SA_encode_results};
704		719
705	# text that is not part of body	720	# text that is not part of body
706	if (exists $self->{inside}{script} && $self->{inside}{script} > 0)	721	if (exists $self->{inside}{script} && $self->{inside}{script} > 0)
Lines 715-721 Link Here
715	# text that is part of body and also stored separately	730	# text that is part of body and also stored separately
716	if (exists $self->{inside}{a} && $self->{inside}{a} > 0) {	731	if (exists $self->{inside}{a} && $self->{inside}{a} > 0) {
717	# this doesn't worry about nested anchors	732	# this doesn't worry about nested anchors
718	$self->{uri}->{$self->{anchor_last}}->{anchor_text}->[-1] .= $text;	733	my $uri = $self->{anchor_last};
		734	utf8::encode($uri) if $self->{SA_encode_results};
		735	$self->{uri}->{$uri}->{anchor_text}->[-1] .= $text;
719	$self->{anchor}->[-1] .= $text;	736	$self->{anchor}->[-1] .= $text;
720	}	737	}
721	if (exists $self->{inside}{title} && $self->{inside}{title} > 0) {	738	if (exists $self->{inside}{title} && $self->{inside}{title} > 0) {
Lines 723-729 Link Here
723	}	740	}
724		741
725	my $invisible_for_bayes = 0;	742	my $invisible_for_bayes = 0;
726	if ($text =~ /[^ \t\n\r\f\x0b\xa0]/) {	743
		744	# NBSP: UTF-8: C2 A0, ISO-8859-*: A0
		745	if ($text !~ /^(?:[ \t\n\r\f\x0b]\|\xc2\xa0)*\z/s) {
727	$invisible_for_bayes = $self->html_font_invisible($text);	746	$invisible_for_bayes = $self->html_font_invisible($text);
728	}	747	}
729		748
Lines 758-763 Link Here
758	# note: $text includes <!-- and -->	777	# note: $text includes <!-- and -->
759	sub html_comment {	778	sub html_comment {
760	my ($self, $text) = @_;	779	my ($self, $text) = @_;
		780	utf8::encode($text) if $self->{SA_encode_results};
761		781
762	push @{ $self->{comment} }, $text;	782	push @{ $self->{comment} }, $text;
763	}	783	}
Lines 764-769 Link Here
764		784
765	sub html_declaration {	785	sub html_declaration {
766	my ($self, $text) = @_;	786	my ($self, $text) = @_;
		787	utf8::encode($text) if $self->{SA_encode_results};
767		788
768	if ($text =~ /^<!doctype/i) {	789	if ($text =~ /^<!doctype/i) {
769	my $tag = "!doctype";	790	my $tag = "!doctype";




    # text/x-aol is ignored here, but looks like text/html ...
    return(undef,undef) unless ( $self->{'type'} =~ /^text\/(?:plain|html)$/i );

    my $text = $self->decode;  # QP and Base64 decoding
    my $raw = length($text);

    # render text/html always, or any other text|text/plain part as text/html
    # based on a heuristic which simulates a certain common mail client
    if ($text ne '' && ($self->{'type'} =~ m{^text/html$}i ||
		        ($self->{'type'} =~ m{^text/plain$}i &&
		         _html_render(substr($text, 0, 23)))))
    {
      $self->{rendered_type} = 'text/html';

      # will input text to HTML::Parser be provided as Unicode characters?
      my $character_semantics = 0;
      if ($self->{normalize} && $enc_utf8) {  # charset decoding requested
        # Provide input to HTML::Parser as Unicode characters
        # which avoids a HTML::Parser bug in utf8_mode
        #   https://rt.cpan.org/Public/Bug/Display.html?id=99755
        # Avoid unnecessary step of encoding->decoding by telling
        # subroutine _normalize() to return Unicode text.  See Bug 7133
        #
        $character_semantics = 1;
        $text = $self->_normalize($text, $self->{charset}, 1);
      } elsif (!defined $self->{charset} ||
               $self->{charset} =~ /^(?:US-ASCII|UTF-8)\z/i) {
        # With some luck input can be interpreted as UTF-8, do not warn.
        # It is still possible to hit the HTML::Parses utf8_mode bug however.
      } else {
        dbg("message: 'normalize_charset' is off, encoding will likely ".
            "be misinterpreted; declared charset: %s", $self->{charset});
      }
      # the 0 requires decoded HTML results to be in bytes (not characters)
      my $html = Mail::SpamAssassin::HTML->new($character_semantics,0); # object

      $html->parse($text);  # parse+render text
      $self->{rendered} = $html->get_rendered_text();
      $self->{visible_rendered} = $html->get_rendered_text(invisible => 0);
      $self->{invisible_rendered} = $html->get_rendered_text(invisible => 1);

      my $space = ($rt =~ tr/ \t\n\r\x0b\xa0/ \t\n\r\x0b\xa0/);
      $r->{html_length} = length($rt);

      my $text_len = length($text);
      $r->{non_space_len} = $r->{html_length} - $space;
      $r->{ratio} = ($text_len - $r->{html_length}) / $text_len;
    }

    else {  # plain text
      if ($self->{normalize} && $enc_utf8) {
        # request transcoded result as UTF-8 octets!
        $text = $self->_normalize($text, $self->{charset}, 0);
      }
      $self->{rendered_type} = $self->{type};
      $self->{rendered} = $self->{'visible_rendered'} = $text;
      $self->{'invisible_rendered'} = '';

    # not possible since the input has already been limited to 'B' and 'Q'
    die "message: unknown encoding type '$cte' in RFC2047 header";
  }
  return $self->_normalize($data, $encoding, 0);  # transcode to UTF-8 octets
}

# Decode base64 and quoted-printable in headers according to RFC2047.

    # Bug 6945: some header fields must not be processed for MIME encoding

  } else {
    local($1,$2,$3);

    # Multiple encoded sections must ignore the interim whitespace.
    # To avoid possible FPs with (\s+(?==\?))?, look for the whole RE

Lines 1142-1148 Link Here

(-)lib/Mail/SpamAssassin/Message.pm (-1 / +2 lines)
1142		1142
1143	# whitespace handling (warning: small changes have large effects!)	1143	# whitespace handling (warning: small changes have large effects!)
1144	$text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed	1144	$text =~ s/\n+\s*\n+/\f/gs; # double newlines => form feed
1145	$text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace (incl. VT, NBSP) => space	1145	# $text =~ tr/ \t\n\r\x0b\xa0/ /s; # whitespace (incl. VT, NBSP) => space
		1146	$text =~ tr/ \t\n\r\x0b/ /s; # whitespace (incl. VT) => space
1146	$text =~ tr/\f/\n/; # form feeds => newline	1147	$text =~ tr/\f/\n/; # form feeds => newline
1147		1148
1148	my @textary = split_into_array_of_short_lines($text);	1149	my @textary = split_into_array_of_short_lines($text);

Lines 21-27 Link Here

(-)t/html_utf8.t (-1 / +2 lines)
21		21
22	tstlocalrules ('	22	tstlocalrules ('
23	body OPPORTUNITY /OPPORTUNITY/	23	body OPPORTUNITY /OPPORTUNITY/
24	body QUOTE_YOUR /\x{201c}Your/	24	# body QUOTE_YOUR /\x{201c}Your/
		25	body QUOTE_YOUR /\xE2\x80\x9CYour/
25	');	26	');
26	sarun ("-L -t < data/spam/009", \&patterns_run_cb);	27	sarun ("-L -t < data/spam/009", \&patterns_run_cb);
27	ok_all_patterns();	28	ok_all_patterns();

Return to bug 7133

Lines 581-599 Link Here

(-)lib/Mail/SpamAssassin/Message/Node.pm (-11 / +37 lines)
581	# text/x-aol is ignored here, but looks like text/html ...	581	# text/x-aol is ignored here, but looks like text/html ...
582	return(undef,undef) unless ( $self->{'type'} =~ /^text\/(?:plain\|html)$/i );	582	return(undef,undef) unless ( $self->{'type'} =~ /^text\/(?:plain\|html)$/i );
583		583
584	my $text = $self->_normalize($self->decode(), $self->{charset});	584	my $text = $self->decode; # QP and Base64 decoding
585	my $raw = length($text);
586		585
587	# render text/html always, or any other text\|text/plain part as text/html	586	# render text/html always, or any other text\|text/plain part as text/html
588	# based on a heuristic which simulates a certain common mail client	587	# based on a heuristic which simulates a certain common mail client
589	if ($raw > 0 && ($self->{'type'} =~ m@^text/html$@i \|\|	588	if ($text ne '' && ($self->{'type'} =~ m{^text/html$}i \|\|
590	($self->{'type'} =~ m@^text/plain$@i &&	589	($self->{'type'} =~ m{^text/plain$}i &&
591	_html_render(substr($text, 0, 23)))))	590	_html_render(substr($text, 0, 23)))))
592	{	591	{
593	$self->{rendered_type} = 'text/html';	592	$self->{rendered_type} = 'text/html';
594		593
595	my $html = Mail::SpamAssassin::HTML->new(); # object	594	# will input text to HTML::Parser be provided as Unicode characters?
596	$html->parse($text); # parse+render text	595	my $character_semantics = 0;
		596	if ($self->{normalize} && $enc_utf8) { # charset decoding requested
		597	# Provide input to HTML::Parser as Unicode characters
		598	# which avoids a HTML::Parser bug in utf8_mode
		599	# https://rt.cpan.org/Public/Bug/Display.html?id=99755
		600	# Avoid unnecessary step of encoding->decoding by telling
		601	# subroutine _normalize() to return Unicode text. See Bug 7133
		602	#
		603	$character_semantics = 1;
		604	$text = $self->_normalize($text, $self->{charset}, 1);
		605	} elsif (!defined $self->{charset} \|\|
		606	$self->{charset} =~ /^(?:US-ASCII\|UTF-8)\z/i) {
		607	# With some luck input can be interpreted as UTF-8, do not warn.
		608	# It is still possible to hit the HTML::Parses utf8_mode bug however.
		609	} else {
		610	dbg("message: 'normalize_charset' is off, encoding will likely ".
		611	"be misinterpreted; declared charset: %s", $self->{charset});
		612	}
		613	# the 0 requires decoded HTML results to be in bytes (not characters)
		614	my $html = Mail::SpamAssassin::HTML->new($character_semantics,0); # object
		615
		616	$html->parse($text); # parse+render text
597	$self->{rendered} = $html->get_rendered_text();	617	$self->{rendered} = $html->get_rendered_text();
598	$self->{visible_rendered} = $html->get_rendered_text(invisible => 0);	618	$self->{visible_rendered} = $html->get_rendered_text(invisible => 0);
599	$self->{invisible_rendered} = $html->get_rendered_text(invisible => 1);	619	$self->{invisible_rendered} = $html->get_rendered_text(invisible => 1);
Lines 607-616 Link Here
607	my $space = ($rt =~ tr/ \t\n\r\x0b\xa0/ \t\n\r\x0b\xa0/);	627	my $space = ($rt =~ tr/ \t\n\r\x0b\xa0/ \t\n\r\x0b\xa0/);
608	$r->{html_length} = length($rt);	628	$r->{html_length} = length($rt);
609		629
		630	my $text_len = length($text);
610	$r->{non_space_len} = $r->{html_length} - $space;	631	$r->{non_space_len} = $r->{html_length} - $space;
611	$r->{ratio} = ($raw - $r->{html_length}) / $raw;	632	$r->{ratio} = ($text_len - $r->{html_length}) / $text_len;
612	}	633	}
613	else {	634
		635	else { # plain text
		636	if ($self->{normalize} && $enc_utf8) {
		637	# request transcoded result as UTF-8 octets!
		638	$text = $self->_normalize($text, $self->{charset}, 0);
		639	}
614	$self->{rendered_type} = $self->{type};	640	$self->{rendered_type} = $self->{type};
615	$self->{rendered} = $self->{'visible_rendered'} = $text;	641	$self->{rendered} = $self->{'visible_rendered'} = $text;
616	$self->{'invisible_rendered'} = '';	642	$self->{'invisible_rendered'} = '';
Lines 732-738 Link Here
732	# not possible since the input has already been limited to 'B' and 'Q'	758	# not possible since the input has already been limited to 'B' and 'Q'
733	die "message: unknown encoding type '$cte' in RFC2047 header";	759	die "message: unknown encoding type '$cte' in RFC2047 header";
734	}	760	}
735	return $self->_normalize($data, $encoding);	761	return $self->_normalize($data, $encoding, 0); # transcode to UTF-8 octets
736	}	762	}
737		763
738	# Decode base64 and quoted-printable in headers according to RFC2047.	764	# Decode base64 and quoted-printable in headers according to RFC2047.
Lines 753-759 Link Here
753	# Bug 6945: some header fields must not be processed for MIME encoding	779	# Bug 6945: some header fields must not be processed for MIME encoding
754		780
755	} else {	781	} else {
756	local($1,$2,$3,$4);	782	local($1,$2,$3);
757		783
758	# Multiple encoded sections must ignore the interim whitespace.	784	# Multiple encoded sections must ignore the interim whitespace.
759	# To avoid possible FPs with (\s+(?==\?))?, look for the whole RE	785	# To avoid possible FPs with (\s+(?==\?))?, look for the whole RE