Index: lib/Mail/SpamAssassin/Plugin/PDFInfo.pm
===================================================================
--- lib/Mail/SpamAssassin/Plugin/PDFInfo.pm	(revision 1830725)
+++ lib/Mail/SpamAssassin/Plugin/PDFInfo.pm	(working copy)
@@ -130,6 +130,11 @@
      body RULENAME eval:pdf_is_empty_body()
         bytes: maximum byte count to allow and still consider it empty
 
+  pdf_has_uri()
+
+     body RULENAME eval:pdf_has_uri()
+        tries to detect if there is a linkable uri in pdf body
+
   NOTE: See the ruleset for more examples that are not documented here.
 
 =back
@@ -174,6 +179,7 @@
   $self->register_eval_rule ("pdf_match_details");
   $self->register_eval_rule ("pdf_is_encrypted");
   $self->register_eval_rule ("pdf_is_empty_body");
+  $self->register_eval_rule ("pdf_has_uri");
 
   return $self;
 }
@@ -212,6 +218,7 @@
   my $no_more_fuzzy = 0;
   my $got_image = 0;
   my $encrypted = 0;
+  my $has_uri = 0;
 
   while($data =~ /([^\n]+)/g) {
     # dbg("pdfinfo: line=$1");
@@ -239,6 +246,12 @@
     # once we hit the first stream, we stop collecting data for fuzzy md5
     $no_more_fuzzy = 1  if ($line =~ m/stream/);
 
+    # XXX some pdf have uris but are stored inside binary data
+    if ($line =~ /\/S\s?\/URI\s?\/URI\s?\(([^\)\\]+)\)\s?/) {
+      dbg("pdfinfo: found URI $1 in pdf " . ($name ? $name : ''));
+      $has_uri = 1;
+    }
+
     # From a v1.3 pdf
     # [12234] dbg: pdfinfo: line=630 0 0 149 0 0 cm
     # [12234] dbg: pdfinfo: line=/Width 630
@@ -355,6 +368,10 @@
       $pms->{pdfinfo}->{details}->{author} = $author;
       $self->_set_tag($pms, 'PDFAUTHOR', $author);
     }
+    if ($has_uri) {
+      $pms->{pdfinfo}->{has_uri} = $has_uri;
+      $self->_set_tag($pms, 'PDFURI', $has_uri);
+    }
     if ($md5) {
       $pms->{pdfinfo}->{md5}->{$md5} = 1;
       $self->_set_tag($pms, 'PDFMD5', $fuzzy_md5);
@@ -707,6 +724,26 @@
 
 # -----------------------------------------
 
+# eval rule: returns the value stored in {pdfinfo}->{has_uri} (set when the
+# PDF scan matched a "/S /URI /URI (...)" link action), or 0 if none was stored.
+sub pdf_has_uri {
+  my ($self, $pms, $body) = @_;
+
+  # make sure we have pdf data read in.  This must run before anything
+  # dereferences through {pdfinfo}: reading {pdfinfo}->{has_uri} first would
+  # autovivify {pdfinfo} and make this existence check always fail.
+  if (!exists $pms->{'pdfinfo'}) {
+    $self->_find_pdf_mime_parts($pms);
+  }
+
+  # fetch the hash once so a still-missing {pdfinfo} is not autovivified here
+  my $pdfinfo = $pms->{'pdfinfo'};
+  return 0 unless ref($pdfinfo) && defined $pdfinfo->{'has_uri'};
+  return $pdfinfo->{'has_uri'};
+}
+
+# -----------------------------------------
+
 sub pdf_match_details {
   my ($self, $pms, $body, $detail, $regex) = @_;
   return unless ($detail && $regex);