Line 0
Link Here
|
|
|
1 |
# Author: Steve Freegard <steve.freegard@fsl.com> |
2 |
# Copyright 2016 Steve Freegard |
3 |
# |
4 |
# <@LICENSE> |
5 |
# Licensed to the Apache Software Foundation (ASF) under one or more |
6 |
# contributor license agreements. See the NOTICE file distributed with |
7 |
# this work for additional information regarding copyright ownership. |
8 |
# The ASF licenses this file to you under the Apache License, Version 2.0 |
9 |
# (the "License"); you may not use this file except in compliance with |
10 |
# the License. You may obtain a copy of the License at: |
11 |
# |
12 |
# http://www.apache.org/licenses/LICENSE-2.0 |
13 |
# |
14 |
# Unless required by applicable law or agreed to in writing, software |
15 |
# distributed under the License is distributed on an "AS IS" BASIS, |
16 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
17 |
# See the License for the specific language governing permissions and |
18 |
# limitations under the License. |
19 |
# </@LICENSE> |
20 |
# |
21 |
|
22 |
=head1 NAME |
23 |
|
24 |
HashBL - seearch email addresses in HashBL blocklists |
25 |
|
26 |
=head1 SYNOPSIS |
27 |
|
28 |
loadplugin Mail::SpamAssassin::Plugin::HashBL |
29 |
header HASHBL_EMAIL eval:check_hashbl_emails('ebl.msbl.org') |
30 |
describe HASHBL_EMAIL Message contains email address found on EBL |
31 |
|
32 |
=head1 DESCRIPTION |
33 |
|
34 |
The Email Blocklist (EBL) contains email addresses used to receive responses to spam emails. |
35 |
These email addresses are sometimes called contact email addresses or |
36 |
drop boxes. |
37 |
The initial target of this blocklist was "Nigerian" 419 Advance Fee Fraud spam. As time passed and more types of spam that used drop boxes was identified, |
38 |
these drop boxes also were listed. |
39 |
The EBL now lists significant numbers of drop boxes used in spam sent |
40 |
by Chinese manufacturers of high-tech and light industrial products, |
41 |
SEO/web development companies, direct spam services, list sellers, and a number |
42 |
of fraudulent or outright illegal products sold by botnets. |
43 |
|
44 |
=cut |
45 |
|
46 |
package Mail::SpamAssassin::Plugin::HashBL; |
47 |
use strict; |
48 |
use warnings; |
49 |
my $VERSION = 0.001; |
50 |
|
51 |
use Mail::SpamAssassin::Plugin; |
52 |
use Mail::SpamAssassin::PerMsgStatus; |
53 |
use Mail::SpamAssassin::Util; |
54 |
use Digest::SHA qw(sha1_hex); |
55 |
use Digest::MD5 qw(md5_hex); |
56 |
|
57 |
use vars qw(@ISA $email_whitelist $skip_replyto_envfrom); |
58 |
@ISA = qw(Mail::SpamAssassin::Plugin); |
59 |
|
60 |
sub dbg { Mail::SpamAssassin::Plugin::dbg ("HashBL: @_"); } |
61 |
|
62 |
sub new { |
63 |
my ($class, $mailsa) = @_; |
64 |
|
65 |
$class = ref($class) || $class; |
66 |
my $self = $class->SUPER::new($mailsa); |
67 |
bless ($self, $class); |
68 |
|
69 |
$self->{hashbl_available} = 1; |
70 |
$self->set_config($mailsa->{conf}); |
71 |
$self->register_eval_rule("check_hashbl_emails"); |
72 |
|
73 |
# Need to init the regex here, utilizing registryboundaries->valid_tlds_re |
74 |
# Some regexp tips courtesy of http://www.regular-expressions.info/email.html |
75 |
# full email regex v0.02 |
76 |
$self->{email_regex} = qr/ |
77 |
(?=.{0,64}\@) # limit userpart to 64 chars (and speed up searching?) |
78 |
(?<![a-z0-9!#\$%&'*+\/=?^_`{|}~-]) # start boundary |
79 |
( # capture email |
80 |
[a-z0-9!#\$%&'*+\/=?^_`{|}~-]+ # no dot in beginning |
81 |
(?:\.[a-z0-9!#\$%&'*+\/=?^_`{|}~-]+)* # no consecutive dots, no ending dot |
82 |
\@ |
83 |
(?:[a-z0-9](?:[a-z0-9-]{0,59}[a-z0-9])?\.){1,4} # max 4x61 char parts (should be enough?) |
84 |
$self->{main}->{registryboundaries}->{valid_tlds_re} # ends with valid tld |
85 |
) |
86 |
(?!(?:[a-z0-9-]|\.[a-z0-9])) # make sure domain ends here |
87 |
/xi; |
88 |
|
89 |
return $self; |
90 |
} |
91 |
|
92 |
sub set_config { |
93 |
my ($self, $conf) = @_; |
94 |
my @cmds; |
95 |
} |
96 |
|
97 |
sub parse_config { |
98 |
my ($self, $opts) = @_; |
99 |
return 0; |
100 |
} |
101 |
|
102 |
sub _parse_headers { |
103 |
my ($self, $pms) = @_; |
104 |
|
105 |
if (not defined $pms->{hashbl_email_cache}) { |
106 |
%{$pms->{hashbl_email_cache}{'headers'}} = (); |
107 |
} |
108 |
|
109 |
my @headers = ('EnvelopeFrom', 'Sender', 'From', 'Reply-To'); |
110 |
|
111 |
foreach my $header (@headers) { |
112 |
my $email = $pms->get($header . ':addr'); |
113 |
if ($email) { |
114 |
dbg("Found email $email in header $header"); |
115 |
$pms->{hashbl_email_cache}{'headers'}{$email} = 1; |
116 |
} |
117 |
} |
118 |
|
119 |
return 1; |
120 |
} |
121 |
|
122 |
sub _parse_body { |
123 |
my ($self, $pms) = @_; |
124 |
|
125 |
# Parse body |
126 |
if (not defined $pms->{hashbl_email_cache}) { |
127 |
%{$pms->{hashbl_email_cache}{'body'}} = (); |
128 |
} |
129 |
|
130 |
my %seen; |
131 |
my @body_emails; |
132 |
# get all <a href="mailto:", since they don't show up on stripped_body |
133 |
my $parsed = $pms->get_uri_detail_list(); |
134 |
while (my($uri, $info) = each %{$parsed}) { |
135 |
if (defined $info->{types}->{a} and not defined $info->{types}->{parsed}) { |
136 |
if ($uri =~ /^(?:(?i)mailto):$self->{email_regex}/) { |
137 |
my $email = lc($1); |
138 |
push(@body_emails, $email) unless defined $seen{$email}; |
139 |
$seen{$email} = 1; |
140 |
last if scalar @body_emails >= 20; # sanity |
141 |
} |
142 |
} |
143 |
} |
144 |
# scan stripped normalized body |
145 |
# have to do this way since get_uri_detail_list doesn't know what mails are inside <> |
146 |
my $body = $pms->get_decoded_stripped_body_text_array(); |
147 |
BODY: foreach (@$body) { |
148 |
# strip urls with possible emails inside |
149 |
s#<?https?://\S{0,255}(?:\@|%40)\S{0,255}# #gi; |
150 |
# strip emails contained in <>, not mailto: |
151 |
# also strip ones followed by quote-like "wrote:" (but not fax: and tel: etc) |
152 |
s#<?(?<!mailto:)$self->{email_regex}(?:>|\s{1,10}(?!(?:fa(?:x|csi)|tel|phone|e?-?mail))[a-z]{2,11}:)# #gi; |
153 |
while (/$self->{email_regex}/g) { |
154 |
my $email = lc($1); |
155 |
push(@body_emails, $email) unless defined $seen{$email}; |
156 |
$seen{$email} = 1; |
157 |
last BODY if scalar @body_emails >= 40; # sanity |
158 |
} |
159 |
} |
160 |
foreach my $email (@body_emails) { |
161 |
dbg("Found email $email in body"); |
162 |
$pms->{hashbl_email_cache}{'body'}{$email} = 1; |
163 |
} |
164 |
|
165 |
return 1; |
166 |
} |
167 |
|
168 |
sub _got_hit { |
169 |
my ($self, $pms, $rulename, $email, $desc) = @_; |
170 |
|
171 |
if (defined $pms->{conf}->{descriptions}->{$rulename}) { |
172 |
$desc = $pms->{conf}->{descriptions}->{$rulename}; |
173 |
} |
174 |
|
175 |
$email =~ s/\@/[at]/g; |
176 |
$pms->got_hit($rulename, "", description => $desc." ($email)", ruletype => 'eval'); |
177 |
} |
178 |
|
179 |
sub _submit_email_query { |
180 |
my ($self, $pms, $list, $type, $email) = @_; |
181 |
my $rulename = $pms->get_current_eval_rule_name(); |
182 |
my ($hash, $lookup, $key); |
183 |
if (uc($type) eq 'SHA1') { |
184 |
$hash = sha1_hex($email); |
185 |
} |
186 |
elsif (uc($type) eq 'MD5') { |
187 |
$hash = md5_hex($email); |
188 |
} |
189 |
$lookup = "$hash.$list."; |
190 |
my $obj = { email => $email }; |
191 |
dbg("list: $list, type: $type, email: $email, hash: $hash, lookup: $lookup"); |
192 |
$key = "HASHBL_EMAIL:$lookup"; |
193 |
my $ent = { |
194 |
key => $key, |
195 |
zone => $list, |
196 |
obj => $obj, |
197 |
type => 'HASHBL', |
198 |
rulename => $rulename, |
199 |
}; |
200 |
|
201 |
$ent = $pms->{async}->bgsend_and_start_lookup($lookup, 'A', undef, $ent, sub { |
202 |
my ($ent2, $pkt) = @_; |
203 |
$self->_finish_email_lookup($pms, $ent2, $pkt); |
204 |
}, master_deadline => $pms->{master_deadline} ); |
205 |
|
206 |
return $ent; |
207 |
} |
208 |
|
209 |
sub _finish_email_lookup { |
210 |
my ($self, $pms, $ent, $pkt) = @_; |
211 |
|
212 |
if (!$pkt) { |
213 |
# $pkt will be undef if the DNS query was aborted (e.g. timed out) |
214 |
dbg("_finish_email_lookup aborted: ", |
215 |
$ent->{rulename}, $ent->{key}); |
216 |
return; |
217 |
} |
218 |
|
219 |
my $email = $ent->{obj}->{email}; |
220 |
|
221 |
dbg("_finish_email_lookup: ", $ent->{rulename}, $ent->{key}, $email); |
222 |
|
223 |
my @answer = $pkt->answer; |
224 |
foreach my $rr (@answer) { |
225 |
if ($rr->address =~ /^127\./) { |
226 |
$self->_got_hit($pms, $ent->{rulename}, $email); |
227 |
$pms->register_async_rule_finish($ent->{rulename}); |
228 |
} |
229 |
} |
230 |
} |
231 |
|
232 |
sub check_hashbl_emails { |
233 |
my ($self, $pms, $list, $type) = @_; |
234 |
|
235 |
return 0 unless $self->{hashbl_available}; |
236 |
|
237 |
my $rulename = $pms->get_current_eval_rule_name(); |
238 |
|
239 |
# First we lookup all unique email addresses found in the headers |
240 |
return 0 unless $self->_parse_headers($pms); |
241 |
foreach my $email (keys %{$pms->{hashbl_email_cache}{'headers'}}) { |
242 |
# Remove this from the body hash |
243 |
delete $pms->{hashbl_email_cache}{'body'}{$email}; |
244 |
dbg("HEADER: $email"); |
245 |
$self->_submit_email_query($pms, $list, (($type) ? $type : 'SHA1'), $email); |
246 |
} |
247 |
|
248 |
# Check any e-mail addresses found in the message body |
249 |
return 0 unless $self->_parse_body($pms); |
250 |
|
251 |
my (@emails) = keys %{$pms->{hashbl_email_cache}{'body'}}; |
252 |
|
253 |
# Randomize order and truncate the array to 10 items maximum |
254 |
Mail::SpamAssassin::Util::fisher_yates_shuffle(\@emails); |
255 |
$#emails = 9 if (scalar @emails > 10); |
256 |
|
257 |
foreach my $email (@emails) { |
258 |
#$self->_got_hit($pms, $email, "Email found in list $list"); |
259 |
dbg("BODY: $email"); |
260 |
$self->_submit_email_query($pms, $list, (($type) ? $type : 'SHA1'), $email); |
261 |
} |
262 |
|
263 |
return 0; |
264 |
} |
265 |
|
266 |
1; |