Index: lib/Mail/SpamAssassin/PerMsgStatus.pm
===================================================================
--- lib/Mail/SpamAssassin/PerMsgStatus.pm (revision 1832709)
+++ lib/Mail/SpamAssassin/PerMsgStatus.pm (working copy)
@@ -2363,6 +2363,64 @@
   return $detail;
 }
 
+=item $pms->add_uri_detail_list($pms, $uri)
+
+Adds C<$uri> to the pool of URIs that will be checked by other plugins.
+C<$pms> is the per-message status object whose URI lists are updated (the
+PDFInfo plugin passes the invocant itself).
+
+An undefined or empty C<$uri> is ignored.  A URI that already has an entry in
+the URI detail list is only flagged as C<parsed>, so repeated links do not
+overwrite that entry or inflate C<uri_domain_count>.
+
+=cut
+
+sub add_uri_detail_list {
+  my ($self, $pms, $uri) = @_;
+
+  # nothing to register for an undefined or empty URI
+  return unless defined $uri && $uri ne '';
+
+  # A URI seen before (e.g. the same link repeated in a PDF) keeps its
+  # existing detail entry; building a fresh one would discard the recorded
+  # types and count its domains a second time.
+  if (exists $pms->{uri_detail_list}->{$uri}) {
+    $pms->{uri_detail_list}->{$uri}->{types}->{parsed} = 1;
+    return;
+  }
+
+  push @{$pms->{parsed_uri_list}}, $uri;
+
+  my @cleaned = Mail::SpamAssassin::Util::uri_list_canonify (undef, $uri);
+  my $info = {
+    types   => { parsed => 1 },
+    cleaned => \@cleaned,
+  };
+
+  foreach my $cleaned_uri (@cleaned) {
+    my ($dom, $host) =
+      $self->{main}->{registryboundaries}->uri_to_domain($cleaned_uri);
+
+    # count each domain only once for this URI
+    next if !$dom || $info->{domains}->{$dom};
+
+    # 3.4 compatibility as per Mark Martinec
+    $info->{hosts}->{$host} = $dom if $host;
+    $info->{domains}->{$dom} = 1;
+    $pms->{uri_domain_count}++;
+  }
+
+  $pms->{uri_detail_list}->{$uri} = $info;
+
+  # uri_list already exists, so append the new URIs to it as well
+  if (exists $pms->{uri_list}) {
+    dbg ('warn: PMS::get_uri_list() appears to have been harvested');
+    push @{$pms->{uri_list}}, @cleaned;
+  }
+
+  return;
+}
+
 sub _get_parsed_uri_list {
   my ($self) = @_;
 
Index: lib/Mail/SpamAssassin/Plugin/PDFInfo.pm
===================================================================
--- lib/Mail/SpamAssassin/Plugin/PDFInfo.pm (revision 1832709)
+++ lib/Mail/SpamAssassin/Plugin/PDFInfo.pm (working copy)
@@ -174,6 +174,7 @@
   $self->register_eval_rule ("pdf_match_details");
   $self->register_eval_rule ("pdf_is_encrypted");
   $self->register_eval_rule ("pdf_is_empty_body");
+  $self->register_method_priority ("parsed_metadata", -1);
 
   return $self;
 }
@@ -212,6 +213,7 @@
   my $no_more_fuzzy = 0;
   my $got_image = 0;
   my $encrypted = 0;
+  my $location = '';
 
   while($data =~ /([^\n]+)/g) {
     # dbg("pdfinfo: line=$1");
@@ -239,6 +241,13 @@
     # once we hit the first stream, we stop collecting data for fuzzy md5
     $no_more_fuzzy = 1 if ($line =~ m/stream/);
 
+    # XXX: some PDFs have URIs, but they are stored inside binary data
+    if ($line =~ /\/S\s?\/URI\s?\/URI\s?\(([^\)\\]+)\)\s?/) {
+      $location = $1;
+      dbg("pdfinfo: found URI $location in pdf " . ($name ? $name : ''));
+      $pms->add_uri_detail_list($pms, $location);
+    }
+
     # From a v1.3 pdf
     # [12234] dbg: pdfinfo: line=630 0 0 149 0 0 cm
     # [12234] dbg: pdfinfo: line=/Width 630
@@ -390,6 +399,24 @@
 
 # ----------------------------------------
 
+# Plugin hook, registered in this patch with priority -1.  Logs a debug
+# warning if the URI detail list already exists, then scans the PDF MIME
+# parts unless 'pdfinfo' data is already present on the message.
+sub parsed_metadata {
+  my ($self, $opts) = @_;
+  my $pms = $opts->{permsgstatus};
+
+  dbg ('warn: get_uri_detail_list() has been called already')
+    if exists $pms->{uri_detail_list};
+
+  # make sure we have pdf data read in.
+  if (!exists $pms->{'pdfinfo'}) {
+    $self->_find_pdf_mime_parts($pms);
+  }
+}
+
+# ----------------------------------------
+
 sub _find_pdf_mime_parts {
   my ($self,$pms) = @_;
 