View | Details | Raw Unified | Return to bug 7133
Collapse All | Expand All

(-)lib/Mail/SpamAssassin/HTML.pm (-27 / +48 lines)
Lines 24-29 Link Here
24
use warnings;
24
use warnings;
25
use re 'taint';
25
use re 'taint';
26
26
27
require 5.008;     # need basic Unicode support for HTML::Parser::utf8_mode
28
# require 5.008008;  # Bug 3787; [perl #37950]: Malformed UTF-8 character ...
29
27
use HTML::Parser 3.43 ();
30
use HTML::Parser 3.43 ();
28
use Mail::SpamAssassin::Logger;
31
use Mail::SpamAssassin::Logger;
29
use Mail::SpamAssassin::Constants qw(:sa);
32
use Mail::SpamAssassin::Constants qw(:sa);
Lines 86-92 Link Here
86
$ok_attributes{div}{$_} = 1 for qw( style );
89
$ok_attributes{div}{$_} = 1 for qw( style );
87
90
88
sub new {
91
sub new {
89
  my ($class) = @_;
92
  my ($class, $character_semantics_input, $character_semantics_output) = @_;
90
  my $self = $class->SUPER::new(
93
  my $self = $class->SUPER::new(
91
		api_version => 3,
94
		api_version => 3,
92
		handlers => [
95
		handlers => [
Lines 99-105 Link Here
99
			declaration => ["html_declaration", "self,text"],
102
			declaration => ["html_declaration", "self,text"],
100
		],
103
		],
101
		marked_sections => 1);
104
		marked_sections => 1);
102
105
  $self->{SA_character_semantics_input} = $character_semantics_input;
106
  $self->{SA_encode_results} =
107
    $character_semantics_input && !$character_semantics_output;
103
  $self;
108
  $self;
104
}
109
}
105
110
Lines 125-131 Link Here
125
130
126
  my @uri;
131
  my @uri;
127
132
128
  # add the canonified version of each uri to the detail list
133
  # add the canonicalized version of each uri to the detail list
129
  if (defined $self->{uri}) {
134
  if (defined $self->{uri}) {
130
    @uri = keys %{$self->{uri}};
135
    @uri = keys %{$self->{uri}};
131
  }
136
  }
Lines 232-239 Link Here
232
  # NOTE: HTML::Parser can cope with: <?xml pis>, <? with space>, so we
237
  # NOTE: HTML::Parser can cope with: <?xml pis>, <? with space>, so we
233
  # don't need to fix them here.
238
  # don't need to fix them here.
234
239
235
  # HTML::Parser converts &nbsp; into a question mark ("?") for some
240
  # # (outdated claim) HTML::Parser converts &nbsp; into a question mark ("?")
236
  # reason, so convert them to spaces.  Confirmed in 3.31, at least.
241
  # # for some reason, so convert them to spaces.  Confirmed in 3.31, at least.
242
  # ... Actually it doesn't, it is correctly coverted into Unicode NBSP,
243
  # nevertheless it does not hurt to treat it as a space.
237
  $text =~ s/&nbsp;/ /g;
244
  $text =~ s/&nbsp;/ /g;
238
245
239
  # bug 4695: we want "<br/>" to be treated the same as "<br>", and
246
  # bug 4695: we want "<br/>" to be treated the same as "<br>", and
Lines 240-255 Link Here
240
  # the HTML::Parser API won't do it for us
247
  # the HTML::Parser API won't do it for us
241
  $text =~ s/<(\w+)\s*\/>/<$1>/gi;
248
  $text =~ s/<(\w+)\s*\/>/<$1>/gi;
242
249
243
  # Ignore stupid warning that can't be suppressed: 'Parsing of
250
  if (!$self->UNIVERSAL::can('utf8_mode')) {
244
  # undecoded UTF-8 will give garbage when decoding entities at ..' (bug 4046)
251
    # utf8_mode is cleared by default, only warn if it would need to be set
245
  {
252
    warn "message: cannot set utf8_mode, module HTML::Parser is too old\n"
246
    local $SIG{__WARN__} = sub {
253
      if !$self->{SA_character_semantics_input};
247
      warn @_ unless (defined $_[0] && $_[0] =~ /^Parsing of undecoded UTF-/);
254
  } else {
248
    };
255
    $self->SUPER::utf8_mode($self->{SA_character_semantics_input} ? 0 : 1);
249
256
    dbg("message: HTML::Parser utf8_mode %s",
250
    $self->SUPER::parse($text);
257
        $self->SUPER::utf8_mode ? "on (assumed UTF-8 octets)"
258
                                : "off (default, assumed Unicode characters)");
251
  }
259
  }
252
260
  $self->SUPER::parse($text);
253
  $self->SUPER::eof;
261
  $self->SUPER::eof;
254
262
255
  return $self->{text};
263
  return $self->{text};
Lines 257-262 Link Here
257
265
258
sub html_tag {
266
sub html_tag {
259
  my ($self, $tag, $attr, $num) = @_;
267
  my ($self, $tag, $attr, $num) = @_;
268
  utf8::encode($tag) if $self->{SA_encode_results};
260
269
261
  my $maybe_namespace = ($tag =~ m@^(?:o|st\d):[\w-]+/?$@);
270
  my $maybe_namespace = ($tag =~ m@^(?:o|st\d):[\w-]+/?$@);
262
271
Lines 276-290 Link Here
276
285
277
  # ignore non-elements
286
  # ignore non-elements
278
  if (exists $elements{$tag} || exists $tricks{$tag}) {
287
  if (exists $elements{$tag} || exists $tricks{$tag}) {
279
    text_style(@_) if exists $elements_text_style{$tag};
288
    $self->text_style($tag, $attr, $num) if exists $elements_text_style{$tag};
280
289
281
    # bug 5009: things like <p> and </p> both need dealing with
290
    # bug 5009: things like <p> and </p> both need dealing with
282
    html_whitespace(@_) if exists $elements_whitespace{$tag};
291
    $self->html_whitespace($tag) if exists $elements_whitespace{$tag};
283
292
284
    # start tags
293
    # start tags
285
    if ($num == 1) {
294
    if ($num == 1) {
286
      html_uri(@_) if exists $elements_uri{$tag};
295
      $self->html_uri($tag, $attr) if exists $elements_uri{$tag};
287
      html_tests(@_);
296
      $self->html_tests($tag, $attr, $num);
288
    }
297
    }
289
    # end tags
298
    # end tags
290
    else {
299
    else {
Lines 315-327 Link Here
315
  my ($self, $type, $uri) = @_;
324
  my ($self, $type, $uri) = @_;
316
325
317
  $uri = $self->canon_uri($uri);
326
  $uri = $self->canon_uri($uri);
327
  utf8::encode($uri) if $self->{SA_encode_results};
318
328
319
  my $target = target_uri($self->{base_href} || "", $uri);
329
  my $target = target_uri($self->{base_href} || "", $uri);
320
330
321
  # skip things like <iframe src="" ...>
331
  # skip things like <iframe src="" ...>
322
  if (length $uri) {
332
  $self->{uri}->{$uri}->{types}->{$type} = 1  if $uri ne '';
323
    $self->{uri}->{$uri}->{types}->{$type} = 1;
324
  }
325
}
333
}
326
334
327
sub canon_uri {
335
sub canon_uri {
Lines 382-387 Link Here
382
390
383
	# Make sure it ends in a slash
391
	# Make sure it ends in a slash
384
	$uri .= "/" unless $uri =~ m@/$@;
392
	$uri .= "/" unless $uri =~ m@/$@;
393
        utf8::encode($uri) if $self->{SA_encode_results};
385
	$self->{base_href} = $uri;
394
	$self->{base_href} = $uri;
386
      }
395
      }
387
    }
396
    }
Lines 604-610 Link Here
604
    }
613
    }
605
  }
614
  }
606
  if ($tag eq "img" && exists $self->{inside}{a} && $self->{inside}{a} > 0) {
615
  if ($tag eq "img" && exists $self->{inside}{a} && $self->{inside}{a} > 0) {
607
    $self->{uri}->{$self->{anchor_last}}->{anchor_text}->[-1] .= "<img>\n";
616
    my $uri = $self->{anchor_last};
617
    utf8::encode($uri) if $self->{SA_encode_results};
618
    $self->{uri}->{$uri}->{anchor_text}->[-1] .= "<img>\n";
608
    $self->{anchor}->[-1] .= "<img>\n";
619
    $self->{anchor}->[-1] .= "<img>\n";
609
  }
620
  }
610
621
Lines 639-646 Link Here
639
650
640
  # special text delimiters - <a> and <title>
651
  # special text delimiters - <a> and <title>
641
  if ($tag eq "a") {
652
  if ($tag eq "a") {
642
    $self->{anchor_last} = (exists $attr->{href} ? $self->canon_uri($attr->{href}) : "");
653
    my $uri = $self->{anchor_last} =
643
    push(@{$self->{uri}->{$self->{anchor_last}}->{anchor_text}}, '');
654
      (exists $attr->{href} ? $self->canon_uri($attr->{href}) : "");
655
    utf8::encode($uri) if $self->{SA_encode_results};
656
    push(@{$self->{uri}->{$uri}->{anchor_text}}, '');
644
    push(@{$self->{anchor}}, '');
657
    push(@{$self->{anchor}}, '');
645
  }
658
  }
646
  if ($tag eq "title") {
659
  if ($tag eq "title") {
Lines 681-687 Link Here
681
    }
694
    }
682
  }
695
  }
683
  else {
696
  else {
684
    $text =~ s/[ \t\n\r\f\x0b\xa0]+/ /g;
697
    # NBSP:  UTF-8: C2 A0, ISO-8859-*: A0
698
    $text =~ s/[ \t\n\r\f\x0b]+|\xc2\xa0/ /gs;
685
    # trim leading whitespace if previous element was whitespace 
699
    # trim leading whitespace if previous element was whitespace 
686
    # and current element is not invisible
700
    # and current element is not invisible
687
    if (@{ $self->{text} } && !$display{invisible} &&
701
    if (@{ $self->{text} } && !$display{invisible} &&
Lines 701-706 Link Here
701
715
702
sub html_text {
716
sub html_text {
703
  my ($self, $text) = @_;
717
  my ($self, $text) = @_;
718
  utf8::encode($text) if $self->{SA_encode_results};
704
719
705
  # text that is not part of body
720
  # text that is not part of body
706
  if (exists $self->{inside}{script} && $self->{inside}{script} > 0)
721
  if (exists $self->{inside}{script} && $self->{inside}{script} > 0)
Lines 715-721 Link Here
715
  # text that is part of body and also stored separately
730
  # text that is part of body and also stored separately
716
  if (exists $self->{inside}{a} && $self->{inside}{a} > 0) {
731
  if (exists $self->{inside}{a} && $self->{inside}{a} > 0) {
717
    # this doesn't worry about nested anchors
732
    # this doesn't worry about nested anchors
718
    $self->{uri}->{$self->{anchor_last}}->{anchor_text}->[-1] .= $text;
733
    my $uri = $self->{anchor_last};
734
    utf8::encode($uri) if $self->{SA_encode_results};
735
    $self->{uri}->{$uri}->{anchor_text}->[-1] .= $text;
719
    $self->{anchor}->[-1] .= $text;
736
    $self->{anchor}->[-1] .= $text;
720
  }
737
  }
721
  if (exists $self->{inside}{title} && $self->{inside}{title} > 0) {
738
  if (exists $self->{inside}{title} && $self->{inside}{title} > 0) {
Lines 723-729 Link Here
723
  }
740
  }
724
741
725
  my $invisible_for_bayes = 0;
742
  my $invisible_for_bayes = 0;
726
  if ($text =~ /[^ \t\n\r\f\x0b\xa0]/) {
743
744
  # NBSP:  UTF-8: C2 A0, ISO-8859-*: A0
745
  if ($text !~ /^(?:[ \t\n\r\f\x0b]|\xc2\xa0)*\z/s) {
727
    $invisible_for_bayes = $self->html_font_invisible($text);
746
    $invisible_for_bayes = $self->html_font_invisible($text);
728
  }
747
  }
729
748
Lines 758-763 Link Here
758
# note: $text includes <!-- and -->
777
# note: $text includes <!-- and -->
759
sub html_comment {
778
sub html_comment {
760
  my ($self, $text) = @_;
779
  my ($self, $text) = @_;
780
  utf8::encode($text) if $self->{SA_encode_results};
761
781
762
  push @{ $self->{comment} }, $text;
782
  push @{ $self->{comment} }, $text;
763
}
783
}
Lines 764-769 Link Here
764
784
765
sub html_declaration {
785
sub html_declaration {
766
  my ($self, $text) = @_;
786
  my ($self, $text) = @_;
787
  utf8::encode($text) if $self->{SA_encode_results};
767
788
768
  if ($text =~ /^<!doctype/i) {
789
  if ($text =~ /^<!doctype/i) {
769
    my $tag = "!doctype";
790
    my $tag = "!doctype";
(-)lib/Mail/SpamAssassin/Message/Node.pm (-11 / +37 lines)
Lines 581-599 Link Here
581
    # text/x-aol is ignored here, but looks like text/html ...
581
    # text/x-aol is ignored here, but looks like text/html ...
582
    return(undef,undef) unless ( $self->{'type'} =~ /^text\/(?:plain|html)$/i );
582
    return(undef,undef) unless ( $self->{'type'} =~ /^text\/(?:plain|html)$/i );
583
583
584
    my $text = $self->_normalize($self->decode(), $self->{charset});
584
    my $text = $self->decode;  # QP and Base64 decoding
585
    my $raw = length($text);
586
585
587
    # render text/html always, or any other text|text/plain part as text/html
586
    # render text/html always, or any other text|text/plain part as text/html
588
    # based on a heuristic which simulates a certain common mail client
587
    # based on a heuristic which simulates a certain common mail client
589
    if ($raw > 0 && ($self->{'type'} =~ m@^text/html$@i ||
588
    if ($text ne '' && ($self->{'type'} =~ m{^text/html$}i ||
590
		     ($self->{'type'} =~ m@^text/plain$@i &&
589
		        ($self->{'type'} =~ m{^text/plain$}i &&
591
		      _html_render(substr($text, 0, 23)))))
590
		         _html_render(substr($text, 0, 23)))))
592
    {
591
    {
593
      $self->{rendered_type} = 'text/html';
592
      $self->{rendered_type} = 'text/html';
594
593
595
      my $html = Mail::SpamAssassin::HTML->new();	# object
594
      # will input text to HTML::Parser be provided as Unicode characters?
596
      $html->parse($text);				# parse+render text
595
      my $character_semantics = 0;
596
      if ($self->{normalize} && $enc_utf8) {  # charset decoding requested
597
        # Provide input to HTML::Parser as Unicode characters
598
        # which avoids a HTML::Parser bug in utf8_mode
599
        #   https://rt.cpan.org/Public/Bug/Display.html?id=99755
600
        # Avoid unnecessary step of encoding->decoding by telling
601
        # subroutine _normalize() to return Unicode text.  See Bug 7133
602
        #
603
        $character_semantics = 1;
604
        $text = $self->_normalize($text, $self->{charset}, 1);
605
      } elsif (!defined $self->{charset} ||
606
               $self->{charset} =~ /^(?:US-ASCII|UTF-8)\z/i) {
607
        # With some luck input can be interpreted as UTF-8, do not warn.
608
        # It is still possible to hit the HTML::Parses utf8_mode bug however.
609
      } else {
610
        dbg("message: 'normalize_charset' is off, encoding will likely ".
611
            "be misinterpreted; declared charset: %s", $self->{charset});
612
      }
613
      # the 0 requires decoded HTML results to be in bytes (not characters)
614
      my $html = Mail::SpamAssassin::HTML->new($character_semantics,0); # object
615
616
      $html->parse($text);  # parse+render text
597
      $self->{rendered} = $html->get_rendered_text();
617
      $self->{rendered} = $html->get_rendered_text();
598
      $self->{visible_rendered} = $html->get_rendered_text(invisible => 0);
618
      $self->{visible_rendered} = $html->get_rendered_text(invisible => 0);
599
      $self->{invisible_rendered} = $html->get_rendered_text(invisible => 1);
619
      $self->{invisible_rendered} = $html->get_rendered_text(invisible => 1);
Lines 607-616 Link Here
607
      my $space = ($rt =~ tr/ \t\n\r\x0b\xa0/ \t\n\r\x0b\xa0/);
627
      my $space = ($rt =~ tr/ \t\n\r\x0b\xa0/ \t\n\r\x0b\xa0/);
608
      $r->{html_length} = length($rt);
628
      $r->{html_length} = length($rt);
609
629
630
      my $text_len = length($text);
610
      $r->{non_space_len} = $r->{html_length} - $space;
631
      $r->{non_space_len} = $r->{html_length} - $space;
611
      $r->{ratio} = ($raw - $r->{html_length}) / $raw;
632
      $r->{ratio} = ($text_len - $r->{html_length}) / $text_len;
612
    }
633
    }
613
    else {
634
635
    else {  # plain text
636
      if ($self->{normalize} && $enc_utf8) {
637
        # request transcoded result as UTF-8 octets!
638
        $text = $self->_normalize($text, $self->{charset}, 0);
639
      }
614
      $self->{rendered_type} = $self->{type};
640
      $self->{rendered_type} = $self->{type};
615
      $self->{rendered} = $self->{'visible_rendered'} = $text;
641
      $self->{rendered} = $self->{'visible_rendered'} = $text;
616
      $self->{'invisible_rendered'} = '';
642
      $self->{'invisible_rendered'} = '';
Lines 732-738 Link Here
732
    # not possible since the input has already been limited to 'B' and 'Q'
758
    # not possible since the input has already been limited to 'B' and 'Q'
733
    die "message: unknown encoding type '$cte' in RFC2047 header";
759
    die "message: unknown encoding type '$cte' in RFC2047 header";
734
  }
760
  }
735
  return $self->_normalize($data, $encoding);
761
  return $self->_normalize($data, $encoding, 0);  # transcode to UTF-8 octets
736
}
762
}
737
763
738
# Decode base64 and quoted-printable in headers according to RFC2047.
764
# Decode base64 and quoted-printable in headers according to RFC2047.
Lines 753-759 Link Here
753
    # Bug 6945: some header fields must not be processed for MIME encoding
779
    # Bug 6945: some header fields must not be processed for MIME encoding
754
780
755
  } else {
781
  } else {
756
    local($1,$2,$3,$4);
782
    local($1,$2,$3);
757
783
758
    # Multiple encoded sections must ignore the interim whitespace.
784
    # Multiple encoded sections must ignore the interim whitespace.
759
    # To avoid possible FPs with (\s+(?==\?))?, look for the whole RE
785
    # To avoid possible FPs with (\s+(?==\?))?, look for the whole RE
(-)lib/Mail/SpamAssassin/Message.pm (-1 / +2 lines)
Lines 1142-1148 Link Here
1142
1142
1143
  # whitespace handling (warning: small changes have large effects!)
1143
  # whitespace handling (warning: small changes have large effects!)
1144
  $text =~ s/\n+\s*\n+/\f/gs;		# double newlines => form feed
1144
  $text =~ s/\n+\s*\n+/\f/gs;		# double newlines => form feed
1145
  $text =~ tr/ \t\n\r\x0b\xa0/ /s;	# whitespace (incl. VT, NBSP) => space
1145
# $text =~ tr/ \t\n\r\x0b\xa0/ /s;	# whitespace (incl. VT, NBSP) => space
1146
  $text =~ tr/ \t\n\r\x0b/ /s;		# whitespace (incl. VT) => space
1146
  $text =~ tr/\f/\n/;			# form feeds => newline
1147
  $text =~ tr/\f/\n/;			# form feeds => newline
1147
1148
1148
  my @textary = split_into_array_of_short_lines($text);
1149
  my @textary = split_into_array_of_short_lines($text);
(-)t/html_utf8.t (-1 / +2 lines)
Lines 21-27 Link Here
21
21
22
tstlocalrules ('
22
tstlocalrules ('
23
body OPPORTUNITY	/OPPORTUNITY/
23
body OPPORTUNITY	/OPPORTUNITY/
24
body QUOTE_YOUR /\x{201c}Your/
24
# body QUOTE_YOUR /\x{201c}Your/
25
body QUOTE_YOUR /\xE2\x80\x9CYour/
25
');
26
');
26
sarun ("-L -t < data/spam/009", \&patterns_run_cb);
27
sarun ("-L -t < data/spam/009", \&patterns_run_cb);
27
ok_all_patterns();
28
ok_all_patterns();

Return to bug 7133