Lines 24-29
Link Here
|
24 |
use warnings; |
24 |
use warnings; |
25 |
use re 'taint'; |
25 |
use re 'taint'; |
26 |
|
26 |
|
|
|
27 |
require 5.008; # need basic Unicode support for HTML::Parser::utf8_mode |
28 |
# require 5.008008; # Bug 3787; [perl #37950]: Malformed UTF-8 character ... |
29 |
|
27 |
use HTML::Parser 3.43 (); |
30 |
use HTML::Parser 3.43 (); |
28 |
use Mail::SpamAssassin::Logger; |
31 |
use Mail::SpamAssassin::Logger; |
29 |
use Mail::SpamAssassin::Constants qw(:sa); |
32 |
use Mail::SpamAssassin::Constants qw(:sa); |
Lines 86-92
Link Here
|
86 |
$ok_attributes{div}{$_} = 1 for qw( style ); |
89 |
$ok_attributes{div}{$_} = 1 for qw( style ); |
87 |
|
90 |
|
88 |
sub new { |
91 |
sub new { |
89 |
my ($class) = @_; |
92 |
my ($class, $character_semantics_input, $character_semantics_output) = @_; |
90 |
my $self = $class->SUPER::new( |
93 |
my $self = $class->SUPER::new( |
91 |
api_version => 3, |
94 |
api_version => 3, |
92 |
handlers => [ |
95 |
handlers => [ |
Lines 99-105
Link Here
|
99 |
declaration => ["html_declaration", "self,text"], |
102 |
declaration => ["html_declaration", "self,text"], |
100 |
], |
103 |
], |
101 |
marked_sections => 1); |
104 |
marked_sections => 1); |
102 |
|
105 |
$self->{SA_character_semantics_input} = $character_semantics_input; |
|
|
106 |
$self->{SA_encode_results} = |
107 |
$character_semantics_input && !$character_semantics_output; |
103 |
$self; |
108 |
$self; |
104 |
} |
109 |
} |
105 |
|
110 |
|
Lines 125-131
Link Here
|
125 |
|
130 |
|
126 |
my @uri; |
131 |
my @uri; |
127 |
|
132 |
|
128 |
# add the canonified version of each uri to the detail list |
133 |
# add the canonicalized version of each uri to the detail list |
129 |
if (defined $self->{uri}) { |
134 |
if (defined $self->{uri}) { |
130 |
@uri = keys %{$self->{uri}}; |
135 |
@uri = keys %{$self->{uri}}; |
131 |
} |
136 |
} |
Lines 232-239
Link Here
|
232 |
# NOTE: HTML::Parser can cope with: <?xml pis>, <? with space>, so we |
237 |
# NOTE: HTML::Parser can cope with: <?xml pis>, <? with space>, so we |
233 |
# don't need to fix them here. |
238 |
# don't need to fix them here. |
234 |
|
239 |
|
235 |
# HTML::Parser converts into a question mark ("?") for some |
240 |
# # (outdated claim) HTML::Parser converts into a question mark ("?") |
236 |
# reason, so convert them to spaces. Confirmed in 3.31, at least. |
241 |
# # for some reason, so convert them to spaces. Confirmed in 3.31, at least. |
|
|
242 |
# ... Actually it doesn't, it is correctly coverted into Unicode NBSP, |
243 |
# nevertheless it does not hurt to treat it as a space. |
237 |
$text =~ s/ / /g; |
244 |
$text =~ s/ / /g; |
238 |
|
245 |
|
239 |
# bug 4695: we want "<br/>" to be treated the same as "<br>", and |
246 |
# bug 4695: we want "<br/>" to be treated the same as "<br>", and |
Lines 240-255
Link Here
|
240 |
# the HTML::Parser API won't do it for us |
247 |
# the HTML::Parser API won't do it for us |
241 |
$text =~ s/<(\w+)\s*\/>/<$1>/gi; |
248 |
$text =~ s/<(\w+)\s*\/>/<$1>/gi; |
242 |
|
249 |
|
243 |
# Ignore stupid warning that can't be suppressed: 'Parsing of |
250 |
if (!$self->UNIVERSAL::can('utf8_mode')) { |
244 |
# undecoded UTF-8 will give garbage when decoding entities at ..' (bug 4046) |
251 |
# utf8_mode is cleared by default, only warn if it would need to be set |
245 |
{ |
252 |
warn "message: cannot set utf8_mode, module HTML::Parser is too old\n" |
246 |
local $SIG{__WARN__} = sub { |
253 |
if !$self->{SA_character_semantics_input}; |
247 |
warn @_ unless (defined $_[0] && $_[0] =~ /^Parsing of undecoded UTF-/); |
254 |
} else { |
248 |
}; |
255 |
$self->SUPER::utf8_mode($self->{SA_character_semantics_input} ? 0 : 1); |
249 |
|
256 |
dbg("message: HTML::Parser utf8_mode %s", |
250 |
$self->SUPER::parse($text); |
257 |
$self->SUPER::utf8_mode ? "on (assumed UTF-8 octets)" |
|
|
258 |
: "off (default, assumed Unicode characters)"); |
251 |
} |
259 |
} |
252 |
|
260 |
$self->SUPER::parse($text); |
253 |
$self->SUPER::eof; |
261 |
$self->SUPER::eof; |
254 |
|
262 |
|
255 |
return $self->{text}; |
263 |
return $self->{text}; |
Lines 257-262
Link Here
|
257 |
|
265 |
|
258 |
sub html_tag { |
266 |
sub html_tag { |
259 |
my ($self, $tag, $attr, $num) = @_; |
267 |
my ($self, $tag, $attr, $num) = @_; |
|
|
268 |
utf8::encode($tag) if $self->{SA_encode_results}; |
260 |
|
269 |
|
261 |
my $maybe_namespace = ($tag =~ m@^(?:o|st\d):[\w-]+/?$@); |
270 |
my $maybe_namespace = ($tag =~ m@^(?:o|st\d):[\w-]+/?$@); |
262 |
|
271 |
|
Lines 276-290
Link Here
|
276 |
|
285 |
|
277 |
# ignore non-elements |
286 |
# ignore non-elements |
278 |
if (exists $elements{$tag} || exists $tricks{$tag}) { |
287 |
if (exists $elements{$tag} || exists $tricks{$tag}) { |
279 |
text_style(@_) if exists $elements_text_style{$tag}; |
288 |
$self->text_style($tag, $attr, $num) if exists $elements_text_style{$tag}; |
280 |
|
289 |
|
281 |
# bug 5009: things like <p> and </p> both need dealing with |
290 |
# bug 5009: things like <p> and </p> both need dealing with |
282 |
html_whitespace(@_) if exists $elements_whitespace{$tag}; |
291 |
$self->html_whitespace($tag) if exists $elements_whitespace{$tag}; |
283 |
|
292 |
|
284 |
# start tags |
293 |
# start tags |
285 |
if ($num == 1) { |
294 |
if ($num == 1) { |
286 |
html_uri(@_) if exists $elements_uri{$tag}; |
295 |
$self->html_uri($tag, $attr) if exists $elements_uri{$tag}; |
287 |
html_tests(@_); |
296 |
$self->html_tests($tag, $attr, $num); |
288 |
} |
297 |
} |
289 |
# end tags |
298 |
# end tags |
290 |
else { |
299 |
else { |
Lines 315-327
Link Here
|
315 |
my ($self, $type, $uri) = @_; |
324 |
my ($self, $type, $uri) = @_; |
316 |
|
325 |
|
317 |
$uri = $self->canon_uri($uri); |
326 |
$uri = $self->canon_uri($uri); |
|
|
327 |
utf8::encode($uri) if $self->{SA_encode_results}; |
318 |
|
328 |
|
319 |
my $target = target_uri($self->{base_href} || "", $uri); |
329 |
my $target = target_uri($self->{base_href} || "", $uri); |
320 |
|
330 |
|
321 |
# skip things like <iframe src="" ...> |
331 |
# skip things like <iframe src="" ...> |
322 |
if (length $uri) { |
332 |
$self->{uri}->{$uri}->{types}->{$type} = 1 if $uri ne ''; |
323 |
$self->{uri}->{$uri}->{types}->{$type} = 1; |
|
|
324 |
} |
325 |
} |
333 |
} |
326 |
|
334 |
|
327 |
sub canon_uri { |
335 |
sub canon_uri { |
Lines 382-387
Link Here
|
382 |
|
390 |
|
383 |
# Make sure it ends in a slash |
391 |
# Make sure it ends in a slash |
384 |
$uri .= "/" unless $uri =~ m@/$@; |
392 |
$uri .= "/" unless $uri =~ m@/$@; |
|
|
393 |
utf8::encode($uri) if $self->{SA_encode_results}; |
385 |
$self->{base_href} = $uri; |
394 |
$self->{base_href} = $uri; |
386 |
} |
395 |
} |
387 |
} |
396 |
} |
Lines 604-610
Link Here
|
604 |
} |
613 |
} |
605 |
} |
614 |
} |
606 |
if ($tag eq "img" && exists $self->{inside}{a} && $self->{inside}{a} > 0) { |
615 |
if ($tag eq "img" && exists $self->{inside}{a} && $self->{inside}{a} > 0) { |
607 |
$self->{uri}->{$self->{anchor_last}}->{anchor_text}->[-1] .= "<img>\n"; |
616 |
my $uri = $self->{anchor_last}; |
|
|
617 |
utf8::encode($uri) if $self->{SA_encode_results}; |
618 |
$self->{uri}->{$uri}->{anchor_text}->[-1] .= "<img>\n"; |
608 |
$self->{anchor}->[-1] .= "<img>\n"; |
619 |
$self->{anchor}->[-1] .= "<img>\n"; |
609 |
} |
620 |
} |
610 |
|
621 |
|
Lines 639-646
Link Here
|
639 |
|
650 |
|
640 |
# special text delimiters - <a> and <title> |
651 |
# special text delimiters - <a> and <title> |
641 |
if ($tag eq "a") { |
652 |
if ($tag eq "a") { |
642 |
$self->{anchor_last} = (exists $attr->{href} ? $self->canon_uri($attr->{href}) : ""); |
653 |
my $uri = $self->{anchor_last} = |
643 |
push(@{$self->{uri}->{$self->{anchor_last}}->{anchor_text}}, ''); |
654 |
(exists $attr->{href} ? $self->canon_uri($attr->{href}) : ""); |
|
|
655 |
utf8::encode($uri) if $self->{SA_encode_results}; |
656 |
push(@{$self->{uri}->{$uri}->{anchor_text}}, ''); |
644 |
push(@{$self->{anchor}}, ''); |
657 |
push(@{$self->{anchor}}, ''); |
645 |
} |
658 |
} |
646 |
if ($tag eq "title") { |
659 |
if ($tag eq "title") { |
Lines 681-687
Link Here
|
681 |
} |
694 |
} |
682 |
} |
695 |
} |
683 |
else { |
696 |
else { |
684 |
$text =~ s/[ \t\n\r\f\x0b\xa0]+/ /g; |
697 |
# NBSP: UTF-8: C2 A0, ISO-8859-*: A0 |
|
|
698 |
$text =~ s/[ \t\n\r\f\x0b]+|\xc2\xa0/ /gs; |
685 |
# trim leading whitespace if previous element was whitespace |
699 |
# trim leading whitespace if previous element was whitespace |
686 |
# and current element is not invisible |
700 |
# and current element is not invisible |
687 |
if (@{ $self->{text} } && !$display{invisible} && |
701 |
if (@{ $self->{text} } && !$display{invisible} && |
Lines 701-706
Link Here
|
701 |
|
715 |
|
702 |
sub html_text { |
716 |
sub html_text { |
703 |
my ($self, $text) = @_; |
717 |
my ($self, $text) = @_; |
|
|
718 |
utf8::encode($text) if $self->{SA_encode_results}; |
704 |
|
719 |
|
705 |
# text that is not part of body |
720 |
# text that is not part of body |
706 |
if (exists $self->{inside}{script} && $self->{inside}{script} > 0) |
721 |
if (exists $self->{inside}{script} && $self->{inside}{script} > 0) |
Lines 715-721
Link Here
|
715 |
# text that is part of body and also stored separately |
730 |
# text that is part of body and also stored separately |
716 |
if (exists $self->{inside}{a} && $self->{inside}{a} > 0) { |
731 |
if (exists $self->{inside}{a} && $self->{inside}{a} > 0) { |
717 |
# this doesn't worry about nested anchors |
732 |
# this doesn't worry about nested anchors |
718 |
$self->{uri}->{$self->{anchor_last}}->{anchor_text}->[-1] .= $text; |
733 |
my $uri = $self->{anchor_last}; |
|
|
734 |
utf8::encode($uri) if $self->{SA_encode_results}; |
735 |
$self->{uri}->{$uri}->{anchor_text}->[-1] .= $text; |
719 |
$self->{anchor}->[-1] .= $text; |
736 |
$self->{anchor}->[-1] .= $text; |
720 |
} |
737 |
} |
721 |
if (exists $self->{inside}{title} && $self->{inside}{title} > 0) { |
738 |
if (exists $self->{inside}{title} && $self->{inside}{title} > 0) { |
Lines 723-729
Link Here
|
723 |
} |
740 |
} |
724 |
|
741 |
|
725 |
my $invisible_for_bayes = 0; |
742 |
my $invisible_for_bayes = 0; |
726 |
if ($text =~ /[^ \t\n\r\f\x0b\xa0]/) { |
743 |
|
|
|
744 |
# NBSP: UTF-8: C2 A0, ISO-8859-*: A0 |
745 |
if ($text !~ /^(?:[ \t\n\r\f\x0b]|\xc2\xa0)*\z/s) { |
727 |
$invisible_for_bayes = $self->html_font_invisible($text); |
746 |
$invisible_for_bayes = $self->html_font_invisible($text); |
728 |
} |
747 |
} |
729 |
|
748 |
|
Lines 758-763
Link Here
|
758 |
# note: $text includes <!-- and --> |
777 |
# note: $text includes <!-- and --> |
759 |
sub html_comment { |
778 |
sub html_comment { |
760 |
my ($self, $text) = @_; |
779 |
my ($self, $text) = @_; |
|
|
780 |
utf8::encode($text) if $self->{SA_encode_results}; |
761 |
|
781 |
|
762 |
push @{ $self->{comment} }, $text; |
782 |
push @{ $self->{comment} }, $text; |
763 |
} |
783 |
} |
Lines 764-769
Link Here
|
764 |
|
784 |
|
765 |
sub html_declaration { |
785 |
sub html_declaration { |
766 |
my ($self, $text) = @_; |
786 |
my ($self, $text) = @_; |
|
|
787 |
utf8::encode($text) if $self->{SA_encode_results}; |
767 |
|
788 |
|
768 |
if ($text =~ /^<!doctype/i) { |
789 |
if ($text =~ /^<!doctype/i) { |
769 |
my $tag = "!doctype"; |
790 |
my $tag = "!doctype"; |