Line 0
Link Here
|
|
|
1 |
# <@LICENSE> |
2 |
# Licensed to the Apache Software Foundation (ASF) under one or more |
3 |
# contributor license agreements. See the NOTICE file distributed with |
4 |
# this work for additional information regarding copyright ownership. |
5 |
# The ASF licenses this file to you under the Apache License, Version 2.0 |
6 |
# (the "License"); you may not use this file except in compliance with |
7 |
# the License. You may obtain a copy of the License at: |
8 |
# |
9 |
# http://www.apache.org/licenses/LICENSE-2.0 |
10 |
# |
11 |
# Unless required by applicable law or agreed to in writing, software |
12 |
# distributed under the License is distributed on an "AS IS" BASIS, |
13 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 |
# See the License for the specific language governing permissions and |
15 |
# limitations under the License. |
16 |
# </@LICENSE> |
17 |
|
18 |
=head1 NAME |
19 |
|
20 |
Mail::SpamAssassin::BayesStore::BDB - BerkeleyDB Bayesian Storage Module Implementation |
21 |
|
22 |
=head1 SYNOPSIS |
23 |
|
24 |
=head1 DESCRIPTION |
25 |
|
26 |
This module implementes a BDB based bayesian storage module. |
27 |
|
28 |
=cut |
29 |
|
30 |
package Mail::SpamAssassin::BayesStore::BDB; |
31 |
|
32 |
use strict; |
33 |
use warnings; |
34 |
use bytes; |
35 |
use re 'taint'; |
36 |
use Errno qw(EBADF); |
37 |
use Data::Dumper; |
38 |
use Digest::SHA1 qw{sha1}; |
39 |
use File::Basename; |
40 |
use File::Path; |
41 |
|
42 |
use Mail::SpamAssassin::BayesStore; |
43 |
use Mail::SpamAssassin::Logger; |
44 |
|
45 |
use vars qw( @ISA ); |
46 |
|
47 |
@ISA = qw( Mail::SpamAssassin::BayesStore ); |
48 |
|
49 |
use constant HAS_BDB => eval { require BerkeleyDB; BerkeleyDB->import; }; |
50 |
|
51 |
my $rmw = DB_RMW; |
52 |
my $next = DB_NEXT; |
53 |
|
54 |
=head1 METHODS |
55 |
|
56 |
=head2 new |
57 |
|
58 |
public class (Mail::SpamAssassin::BayesStore::SQL) new (Mail::Spamassassin::Plugin::Bayes $bayes) |
59 |
|
60 |
Description: |
61 |
This methods creates a new instance of the Mail::SpamAssassin::BayesStore::BDB |
62 |
object. It expects to be passed an instance of the Mail::SpamAssassin:Bayes |
63 |
object which is passed into the Mail::SpamAssassin::BayesStore parent object. |
64 |
|
65 |
=cut |
66 |
|
67 |
sub new { |
68 |
my $class = shift; |
69 |
$class = ref($class) || $class; |
70 |
my $self = $class->SUPER::new(@_); |
71 |
$self->{supported_db_version} = 3; |
72 |
$self->{already_tied} = 0; |
73 |
$self->{is_locked} = 0; |
74 |
return $self; |
75 |
} |
76 |
|
77 |
=head2 tie_db_readonly |
78 |
|
79 |
public instance (Boolean) tie_db_readonly (); |
80 |
|
81 |
Description: |
82 |
This method ensures that the database connection is properly setup and |
83 |
working. It takes 'read-only' very seriously, and will not try to |
84 |
initialize anything. |
85 |
|
86 |
=cut |
87 |
|
88 |
sub tie_db_readonly { |
89 |
my($self) = @_; |
90 |
#dbg("BDB: tie_db_readonly"); |
91 |
my $result = ($self->{already_tied} and $self->{is_locked} == 0) || $self->_tie_db(0); |
92 |
#dbg("BDB: tie_db_readonly result is $result"); |
93 |
return $result; |
94 |
} |
95 |
|
96 |
=head2 tie_db_writable |
97 |
|
98 |
public instance (Boolean) tie_db_writable () |
99 |
|
100 |
Description: |
101 |
This method ensures that the database connetion is properly setup and |
102 |
working. If necessary it will initialize the database so that they can |
103 |
begin using the database immediately. |
104 |
|
105 |
=cut |
106 |
|
107 |
sub tie_db_writable { |
108 |
my($self) = @_; |
109 |
#dbg("BDB: tie_db_writable"); |
110 |
my $result = ($self->{already_tied} and $self->{is_locked} == 1) || $self->_tie_db(1); |
111 |
#dbg("BDB: tie_db_writable result is $result"); |
112 |
return $result; |
113 |
} |
114 |
|
115 |
=head2 _tie_db |
116 |
|
117 |
private instance (Boolean) _tie_db (Boolean $writeable) |
118 |
|
119 |
Description: |
120 |
This method ensures that the database connetion is properly setup and |
121 |
working. If it will initialize a users bayes variables so that they |
122 |
can begin using the database immediately. |
123 |
|
124 |
=cut |
125 |
|
126 |
sub _tie_db { |
127 |
my($self, $writeable) = @_; |
128 |
|
129 |
#dbg("BDB: _tie_db($writeable)"); |
130 |
|
131 |
# Always notice state changes |
132 |
$self->{is_locked} = $writeable; |
133 |
|
134 |
return 1 if($self->{already_tied}); |
135 |
|
136 |
#dbg("BDB: not already tied"); |
137 |
|
138 |
my $main = $self->{bayes}->{main}; |
139 |
|
140 |
if (!defined($main->{conf}->{bayes_path})) { |
141 |
#dbg("BDB: bayes_path not defined"); |
142 |
return 0; |
143 |
} |
144 |
|
145 |
#dbg("BDB: Reading db configs"); |
146 |
$self->read_db_configs(); |
147 |
|
148 |
my $path = dirname $main->sed_path($main->{conf}->{bayes_path}); |
149 |
|
150 |
#dbg("BDB: Path is $path"); |
151 |
# Path must exist or we must be in writeable mode |
152 |
if (-d $path) { |
153 |
# All is cool |
154 |
} elsif ($writeable) { |
155 |
# Create the path |
156 |
eval { |
157 |
mkpath($path, 0, (oct($main->{conf}->{bayes_file_mode}) & 0777)); |
158 |
}; |
159 |
warn("BDB: Couldn't create path: $@") if ($@); |
160 |
} else { |
161 |
# FAIL |
162 |
warn("BDB: bayes_path doesn't exist and can't create: $path"); |
163 |
return 0; |
164 |
} |
165 |
|
166 |
# Now we can set up our environment |
167 |
my $flags = DB_INIT_LOCK|DB_INIT_LOG|DB_INIT_MPOOL|DB_INIT_TXN; |
168 |
$flags |= DB_CREATE if($writeable); |
169 |
# DB_REGISTER|DB_RECOVER| |
170 |
|
171 |
#dbg("BDB: Creating environment: $path, $flags, $main->{conf}->{bayes_file_mode}"); |
172 |
unless ($self->{env} = BerkeleyDB::Env->new(-Cachesize => 67108864, -Home => $path, -Flags => $flags, -Mode =>(oct($main->{conf}->{bayes_file_mode}) & 0666), -SetFlags => DB_LOG_AUTOREMOVE)) { |
173 |
#dbg("BDB: berkeleydb environment couldn't initialize: $BerkeleyDB::Error"); |
174 |
return 0; |
175 |
} |
176 |
|
177 |
$flags = $writeable ? DB_CREATE : 0; |
178 |
|
179 |
#dbg("BDB: Opening vars"); |
180 |
unless ($self->{handles}->{vars} = BerkeleyDB::Btree->new(-Env => $self->{env}, -Filename => "vars.db", -Flags => $flags)) { |
181 |
warn("BDB: couldn't open vars.db: $BerkeleyDB::Error"); |
182 |
$self->untie_db; |
183 |
return 0; |
184 |
} |
185 |
|
186 |
#dbg("BDB: Looking for db_version"); |
187 |
unless ($self->{db_version} = $self->_get(vars => "DB_VERSION")) { |
188 |
if ($writeable) { |
189 |
$self->{db_version} = $self->DB_VERSION; |
190 |
$self->{handles}->{vars}->db_put(DB_VERSION => $self->{db_version}) and die "Couldn't put record: $BerkeleyDB::Error"; |
191 |
$self->{handles}->{vars}->db_put(NTOKENS => 0) and die "Couldn't put record: $BerkeleyDB::Error"; |
192 |
#dbg("BDB: new db, set db version " . $self->{db_version} . " and 0 tokens"); |
193 |
} else { |
194 |
warn("BDB: vars.db not intialized: $BerkeleyDB::Error"); |
195 |
$self->untie_db; |
196 |
return 0; |
197 |
} |
198 |
} elsif ($self->{db_version}) { |
199 |
#dbg("BDB: found bayes db version $self->{db_version}"); |
200 |
if ($self->{db_version} != $self->DB_VERSION) { |
201 |
warn("BDB: bayes db version $self->{db_version} is not able to be used, aborting: $BerkeleyDB::Error"); |
202 |
$self->untie_db(); |
203 |
return 0; |
204 |
} |
205 |
} |
206 |
|
207 |
#dbg("BDB: Opening tokens"); |
208 |
unless ($self->{handles}->{tokens} = BerkeleyDB::Btree->new(-Env => $self->{env}, -Filename => "tokens.db", -Flags => $flags, -Property => DB_REVSPLITOFF)) { |
209 |
warn("BDB: couldn't open tokens.db: $BerkeleyDB::Error"); |
210 |
$self->untie_db; |
211 |
return 0; |
212 |
} |
213 |
|
214 |
#dbg("BDB: Opening atime secondary DB"); |
215 |
unless ($self->{handles}->{atime} = BerkeleyDB::Btree->new(-Env => $self->{env}, -Filename => "atime.db", -Flags => $flags, -Property => DB_DUP|DB_DUPSORT)) { |
216 |
warn("BDB: couldn't open atime.db: $BerkeleyDB::Error"); |
217 |
$self->untie_db; |
218 |
return 0; |
219 |
} |
220 |
|
221 |
#dbg("BDB: Opening seen DB"); |
222 |
unless ($self->{handles}->{seen} = BerkeleyDB::Btree->new(-Env => $self->{env}, -Filename => "seen.db", -Flags => $flags)) { |
223 |
warn("BDB: couldn't open tokens.db: $BerkeleyDB::Error"); |
224 |
$self->untie_db; |
225 |
return 0; |
226 |
} |
227 |
|
228 |
# This MUST be outside the transaction that opens the DB, or it just doesn't work. Dunno Why. |
229 |
$self->{handles}->{tokens}->associate($self->{handles}->{atime}, \&_extract_atime) and die "Couldn't associate DBs: $BerkeleyDB::Error"; |
230 |
|
231 |
$self->{already_tied} = 1; |
232 |
|
233 |
return 1; |
234 |
} |
235 |
|
236 |
=head2 untie_db |
237 |
|
238 |
public instance () untie_db () |
239 |
|
240 |
Description: |
241 |
Closes any open db handles. You can safely call this at any time. |
242 |
|
243 |
=cut |
244 |
|
245 |
sub untie_db { |
246 |
my $self = shift; |
247 |
|
248 |
$self->{is_locked} = 0; |
249 |
$self->{already_tied} = 0; |
250 |
$self->{db_version} = undef; |
251 |
|
252 |
for my $handle (keys %{$self->{handles}}) { |
253 |
# Since we are using transactions, this should be fine |
254 |
$self->{handles}->{$handle}->db_close (DB_NOSYNC); |
255 |
delete $self->{handles}->{$handle}; |
256 |
} |
257 |
|
258 |
$self->{env}->txn_checkpoint (128, 1) if $self->{env}; |
259 |
|
260 |
delete $self->{env}; |
261 |
return undef; |
262 |
} |
263 |
|
264 |
=head2 calculate_expire_delta |
265 |
|
266 |
public instance (%) calculate_expire_delta (Integer $newest_atime, |
267 |
Integer $start, |
268 |
Integer $max_expire_mult) |
269 |
|
270 |
Description: |
271 |
This method performs a calculation on the data to determine the |
272 |
optimum atime for token expiration. |
273 |
|
274 |
=cut |
275 |
|
276 |
sub calculate_expire_delta { |
277 |
#dbg("BDB: calculate_expire_delta starting"); |
278 |
my($self, $newest_atime, $start, $max_expire_mult) = @_; |
279 |
|
280 |
my %delta; # use a hash since an array is going to be very sparse |
281 |
|
282 |
my $cursor = $self->{handles}->{atime}->db_cursor or die "Couldn't get cursor: $BerkeleyDB::Error"; |
283 |
|
284 |
my($atime, $value) = ("", ""); |
285 |
|
286 |
# Do the first pass, figure out atime delta by iterating over our |
287 |
# *secondary* index, avoiding the decoding overhead |
288 |
while ($cursor->c_get($atime, $value, $next) == 0) { |
289 |
|
290 |
# Go through from $start * 1 to $start * 512, mark how many tokens we would expire |
291 |
my $age = $newest_atime - $atime; |
292 |
for (my $i = 1; $i <= $max_expire_mult; $i <<= 1) { |
293 |
if ($age >= $start * $i) { |
294 |
$delta{$i}++; |
295 |
} else { |
296 |
# If the token age is less than the expire delta, it'll be |
297 |
# less for all upcoming checks too, so abort early. |
298 |
last; |
299 |
} |
300 |
} |
301 |
} |
302 |
|
303 |
$cursor->c_close and die "Couldn't close cursor: $BerkeleyDB::Error"; |
304 |
undef $cursor; |
305 |
|
306 |
#dbg("BDB: calculate_expire_delta done"); |
307 |
return %delta; |
308 |
} |
309 |
|
310 |
=head2 token_expiration |
311 |
|
312 |
public instance (Integer, Integer, |
313 |
Integer, Integer) token_expiration (\% $opts, |
314 |
Integer $newdelta, |
315 |
@ @vars) |
316 |
|
317 |
Description: |
318 |
This method performs the database specific expiration of tokens based on |
319 |
the passed in C<$newdelta> and C<@vars>. |
320 |
|
321 |
=cut |
322 |
|
323 |
sub token_expiration { |
324 |
#dbg("BDB: Entering token_expiration"); |
325 |
my($self, $opts, $newdelta, @vars) = @_; |
326 |
|
327 |
my($kept, $deleted, $hapaxes, $lowfreq) = (0, 0, 0, 0); |
328 |
|
329 |
# Reset stray too-new tokens |
330 |
{ |
331 |
my $cursor = $self->{handles}->{atime}->db_cursor or die "Couldn't get cursor: $BerkeleyDB::Error"; |
332 |
|
333 |
# Grab the token for a tight RWM loop |
334 |
my($atime, $flag) = ($vars[10], DB_SET_RANGE|$rmw); |
335 |
# Find the first token eq or gt the current newest |
336 |
while ($cursor->c_pget($atime, my $token, my $value, $flag) == 0) { |
337 |
my($ts, $th, $current) = _unpack_token($value); |
338 |
$self->{handles}->{tokens}->db_put($token, _pack_token($ts, $th, $atime)) and die "Couldn't put record: $BerkeleyDB::Error"; |
339 |
$flag = $next|$rmw; # We need to adjust our flag to continue on from the first rec |
340 |
} |
341 |
|
342 |
$cursor->c_close and die "Couldn't close cursor: $BerkeleyDB::Error"; |
343 |
undef $cursor; |
344 |
} |
345 |
|
346 |
# Figure out how old is too old... |
347 |
my $too_old = $vars[10] - $newdelta; # tooold = newest - delta |
348 |
#dbg("BDB: Too old is $too_old"); |
349 |
|
350 |
#dbg("BDB: Getting db stats"); |
351 |
my $count; |
352 |
|
353 |
# Estimate the number of keys to be deleted |
354 |
{ |
355 |
my $stats = $self->{handles}->{atime}->db_stat(DB_FAST_STAT); |
356 |
#dbg("DBD: Stats: " . Dumper $stats); |
357 |
# Scan if we've never gotten stats before |
358 |
$stats = $self->{handles}->{atime}->db_stat if($stats->{bt_ndata} == 0); |
359 |
#dbg("DBD: Stats: " . Dumper $stats); |
360 |
if ($self->{handles}->{atime}->db_key_range($too_old, my $less, my $equal, my $greater) == 0) { |
361 |
#dbg("DBD: less is $less, equal is $equal, greater is $greater"); |
362 |
$count = $stats->{bt_ndata} - $stats->{bt_ndata} * $greater; |
363 |
} |
364 |
} |
365 |
|
366 |
#dbg("BDB: Considering deleting $vars[3], $count"); |
367 |
|
368 |
# As long as too many tokens wouldn't be deleted |
369 |
if ($vars[3] - $count >= 100000) { |
370 |
|
371 |
#dbg("BDB: Preparing to iterate"); |
372 |
|
373 |
my $cursor = $self->{handles}->{atime}->db_cursor or die "Couldn't get cursor: $BerkeleyDB::Error"; |
374 |
|
375 |
my ($atime, $oldest, $token, $value); |
376 |
|
377 |
$atime = 0; |
378 |
|
379 |
while ($cursor->c_pget($atime, $token, $value, $next) == 0) { |
380 |
# We're traversing in order, so done |
381 |
$oldest = $atime, last if($atime >= $too_old); |
382 |
#dbg("BDB: Deleting record"); |
383 |
$cursor->c_del; |
384 |
$deleted++; |
385 |
my($ts, $th, $atime) = _unpack_token($value); |
386 |
if ($ts + $th == 1) { |
387 |
$hapaxes++; |
388 |
} elsif ($ts < 8 && $th < 8) { |
389 |
$lowfreq++; |
390 |
} |
391 |
} |
392 |
|
393 |
#dbg("BDB: Done with cursor"); |
394 |
$cursor->c_close and die "Couldn't close cursor: $BerkeleyDB::Error"; |
395 |
undef $cursor; |
396 |
|
397 |
$kept = $self->_get (vars => "NTOKENS", $rmw) - $deleted; |
398 |
$self->{handles}->{vars}->db_put(NTOKENS => $kept) and die "Couldn't put record: $BerkeleyDB::Error"; |
399 |
$self->{handles}->{vars}->db_put(LAST_EXPIRE => time) and die "Couldn't put record: $BerkeleyDB::Error"; |
400 |
$self->{handles}->{vars}->db_put(OLDEST_TOKEN_AGE => $oldest) and die "Couldn't put record: $BerkeleyDB::Error"; |
401 |
$self->{handles}->{vars}->db_put(LAST_EXPIRE_REDUCE => $deleted) and die "Couldn't put record: $BerkeleyDB::Error"; |
402 |
$self->{handles}->{vars}->db_put(LAST_ATIME_DELTA => $newdelta) and die "Couldn't put record: $BerkeleyDB::Error"; |
403 |
|
404 |
#$self->{handles}->{atime}->compact; |
405 |
#$self->{handles}->{tokens}->compact; |
406 |
#$self->{handles}->{vars}->compact; |
407 |
|
408 |
} else { |
409 |
#dbg("BDB: Update vars to regenerate histogram"); |
410 |
# Make sure we regenerate our histogramn |
411 |
$kept = $self->_get(vars => "NTOKENS"); |
412 |
$self->{handles}->{vars}->db_put(LAST_EXPIRE => time) and die "Couldn't put record: $BerkeleyDB::Error"; |
413 |
$self->{handles}->{vars}->db_put(LAST_ATIME_DELTA => 0) and die "Couldn't put record: $BerkeleyDB::Error"; |
414 |
$self->{handles}->{vars}->db_put(LAST_EXPIRE_REDUCE => 0) and die "Couldn't put record: $BerkeleyDB::Error"; |
415 |
} |
416 |
|
417 |
#dbg("BDB: token_expiration done"); |
418 |
return($kept, $deleted, $hapaxes, $lowfreq); |
419 |
} |
420 |
|
421 |
=head2 sync_due |
422 |
|
423 |
public instance (Boolean) sync_due () |
424 |
|
425 |
Description: |
426 |
This method determines if a database sync is currently required. |
427 |
|
428 |
Unused for BDB implementation. |
429 |
|
430 |
=cut |
431 |
|
432 |
sub sync_due { |
433 |
return 0; |
434 |
} |
435 |
|
436 |
=head2 seen_get |
437 |
|
438 |
public instance (String) seen_get (string $msgid) |
439 |
|
440 |
Description: |
441 |
This method retrieves the stored value, if any, for C<$msgid>. The return value |
442 |
is the stored string ('s' for spam and 'h' for ham) or undef if C<$msgid> is not |
443 |
found. |
444 |
|
445 |
=cut |
446 |
|
447 |
sub seen_get { |
448 |
#dbg("BDB: Entering seen_get"); |
449 |
my($self, $msgid) = @_; |
450 |
|
451 |
my $value = $self->_get(seen => $msgid); |
452 |
|
453 |
return $value; |
454 |
} |
455 |
|
456 |
=head2 seen_put |
457 |
|
458 |
public (Boolean) seen_put (string $msgid, char $flag) |
459 |
|
460 |
Description: |
461 |
This method records C<$msgid> as the type given by C<$flag>. C<$flag> is one of |
462 |
two values 's' for spam and 'h' for ham. |
463 |
|
464 |
=cut |
465 |
|
466 |
sub seen_put { |
467 |
#dbg("BDB: Entering seen_put"); |
468 |
my($self, $msgid, $flag) = @_; |
469 |
|
470 |
$self->{handles}->{seen}->db_put($msgid, $flag) and die "Couldn't put record: $BerkeleyDB::Error"; |
471 |
|
472 |
return 1; |
473 |
} |
474 |
|
475 |
=head2 seen_delete |
476 |
|
477 |
public instance (Boolean) seen_delete (string $msgid) |
478 |
|
479 |
Description: |
480 |
This method removes C<$msgid> from the database. |
481 |
|
482 |
=cut |
483 |
|
484 |
sub seen_delete { |
485 |
#dbg("BDB: Entering seen_delete"); |
486 |
my($self, $msgid) = @_; |
487 |
|
488 |
my $result; |
489 |
|
490 |
my $status = $self->{handles}->{seen}->db_del($msgid); |
491 |
|
492 |
if ($status == 0) { |
493 |
$result = 1; |
494 |
} elsif ($status == DB_NOTFOUND) { |
495 |
$result = 0E0; |
496 |
} else { |
497 |
die "Couldn't delete record: $BerkeleyDB::Error"; |
498 |
} |
499 |
|
500 |
return $result; |
501 |
} |
502 |
|
503 |
=head2 get_storage_variables |
504 |
|
505 |
public instance (@) get_storage_variables () |
506 |
|
507 |
Description: |
508 |
This method retrieves the various administrative variables used by |
509 |
the Bayes process and database. |
510 |
|
511 |
The values returned in the array are in the following order: |
512 |
|
513 |
0: scan count base |
514 |
|
515 |
1: number of spam |
516 |
|
517 |
2: number of ham |
518 |
|
519 |
3: number of tokens in db |
520 |
|
521 |
4: last expire atime |
522 |
|
523 |
5: oldest token in db atime |
524 |
|
525 |
6: db version value |
526 |
|
527 |
7: last journal sync |
528 |
|
529 |
8: last atime delta |
530 |
|
531 |
9: last expire reduction count |
532 |
|
533 |
10: newest token in db atime |
534 |
|
535 |
=cut |
536 |
|
537 |
sub get_storage_variables { |
538 |
#dbg("BDB: get_storage_variables starting"); |
539 |
my($self) = @_; |
540 |
|
541 |
my @values; |
542 |
for my $token (qw{LAST_JOURNAL_SYNC NSPAM NHAM NTOKENS LAST_EXPIRE OLDEST_TOKEN_AGE DB_VERSION LAST_JOURNAL_SYNC LAST_ATIME_DELTA LAST_EXPIRE_REDUCE NEWEST_TOKEN_AGE}) { |
543 |
my $value = $self->_get (vars => $token); |
544 |
$value = 0 unless($value and $value =~ /\d+/); |
545 |
push @values, $value; |
546 |
} |
547 |
|
548 |
#dbg("BDB: get_storage_variables done"); |
549 |
return @values; |
550 |
} |
551 |
|
552 |
=head2 dump_tokens |
553 |
|
554 |
public instance () dump_tokens (String $template, String $regex, Array @vars) |
555 |
|
556 |
Description: |
557 |
This method loops over all tokens, computing the probability for the token and then |
558 |
printing it out according to the passed in token. |
559 |
|
560 |
=cut |
561 |
|
562 |
sub dump_tokens { |
563 |
#dbg("BDB: dump_tokens starting"); |
564 |
my($self, $template, $regex, @vars) = @_; |
565 |
|
566 |
my $cursor = $self->{handles}->{tokens}->db_cursor or die "Couldn't get cursor: $BerkeleyDB::Error"; |
567 |
my ($token, $value) = ("", ""); |
568 |
while ($cursor->c_get($token, $value, $next) == 0) { |
569 |
next if(defined $regex && ($token !~ /$regex/o)); |
570 |
my($ts, $th, $atime) = _unpack_token($value); |
571 |
my $prob = $self->{bayes}->compute_prob_for_token($token, $vars[1], $vars[2], $ts, $th) || 0.5; |
572 |
my $encoded = unpack("H*",$token); |
573 |
printf $template, $prob, $ts, $th, $atime, $encoded; |
574 |
} |
575 |
|
576 |
$cursor->c_close and die "Couldn't close cursor: $BerkeleyDB::Error"; |
577 |
undef $cursor; |
578 |
|
579 |
#dbg("BDB: dump_tokens done"); |
580 |
return 1; |
581 |
} |
582 |
|
583 |
=head2 set_last_expire |
584 |
|
585 |
public instance (Boolean) set_last_expire (Integer $time) |
586 |
|
587 |
Description: |
588 |
This method sets the last expire time. |
589 |
|
590 |
=cut |
591 |
|
592 |
sub set_last_expire { |
593 |
#dbg("BDB: Entering set_last_expire"); |
594 |
my($self, $time) = @_; |
595 |
$self->{handles}->{vars}->db_put(LAST_EXPIRE => $time) and die "Couldn't put record: $BerkeleyDB::Error"; |
596 |
return 1; |
597 |
} |
598 |
|
599 |
=head2 get_running_expire_tok |
600 |
|
601 |
public instance (String $time) get_running_expire_tok () |
602 |
|
603 |
Description: |
604 |
This method determines if an expire is currently running and returns |
605 |
the last time set. |
606 |
|
607 |
There can be multiple times, so we just pull the greatest (most recent) |
608 |
value. |
609 |
|
610 |
=cut |
611 |
|
612 |
sub get_running_expire_tok { |
613 |
#dbg("BDB: Entering get_running_expire_tok"); |
614 |
my($self) = @_; |
615 |
|
616 |
my $value = $self->_get (vars => "RUNNING_EXPIRE") || ""; |
617 |
my $result = $value if $value =~ /^\d+$/; |
618 |
|
619 |
#dbg("BDB: get_running_expire_tok exiting with $result"); |
620 |
return $result; |
621 |
} |
622 |
|
623 |
=head2 set_running_expire_tok |
624 |
|
625 |
public instance (String $time) set_running_expire_tok () |
626 |
|
627 |
Description: |
628 |
This method sets the time that an expire starts running. |
629 |
|
630 |
=cut |
631 |
|
632 |
sub set_running_expire_tok { |
633 |
my($self) = @_; |
634 |
|
635 |
my $time = time; |
636 |
$self->{handles}->{vars}->db_put(RUNNING_EXPIRE => $time) and die "Couldn't put record: $BerkeleyDB::Error"; |
637 |
|
638 |
return $time; |
639 |
} |
640 |
|
641 |
=head2 remove_running_expire_tok |
642 |
|
643 |
public instance (Boolean) remove_running_expire_tok () |
644 |
|
645 |
Description: |
646 |
This method removes the row in the database that indicates that |
647 |
and expire is currently running. |
648 |
|
649 |
=cut |
650 |
|
651 |
sub remove_running_expire_tok { |
652 |
my($self) = @_; |
653 |
|
654 |
my $status = $self->{handles}->{vars}->db_del("RUNNING_EXPIRE"); |
655 |
|
656 |
my $result; |
657 |
|
658 |
if ($status == 0) { |
659 |
$result = 1; |
660 |
} elsif ($status == DB_NOTFOUND) { |
661 |
$result = 0E0; |
662 |
} else { |
663 |
die "Couldn't delete record: $BerkeleyDB::Error"; |
664 |
} |
665 |
|
666 |
return $result; |
667 |
} |
668 |
|
669 |
=head2 tok_get |
670 |
|
671 |
public instance (Integer, Integer, Integer) tok_get (String $token) |
672 |
|
673 |
Description: |
674 |
This method retrieves a specificed token (C<$token>) from the database |
675 |
and returns it's spam_count, ham_count and last access time. |
676 |
|
677 |
=cut |
678 |
|
679 |
sub tok_get { |
680 |
#dbg("BDB: Entering tok_get"); |
681 |
my($self, $token) = @_; |
682 |
my $array = $self->tok_get_all ([$token]); |
683 |
return @{$array->[0]}; |
684 |
} |
685 |
|
686 |
=head2 tok_get_all |
687 |
|
688 |
public instance (\@) tok_get (@ $tokens) |
689 |
|
690 |
Description: |
691 |
This method retrieves the specified tokens (C<$tokens>) from storage and returns |
692 |
an array ref of arrays spam count, ham acount and last access time. |
693 |
|
694 |
=cut |
695 |
|
696 |
sub tok_get_all { |
697 |
#dbg("BDB: Entering tok_get_all"); |
698 |
my($self, @keys) = @_; |
699 |
|
700 |
my @values; |
701 |
for my $token (@keys) { |
702 |
if (my $value = $self->_get(seen => $token)) { |
703 |
push(@values, [$token, _unpack_token($value)]); |
704 |
} |
705 |
} |
706 |
|
707 |
# #dbg("BDB: tok_get_all returning with " . Dump \@values); |
708 |
return \@values; |
709 |
} |
710 |
|
711 |
=head2 tok_count_change |
712 |
|
713 |
public instance (Boolean) tok_count_change (Integer $dspam, |
714 |
Integer $dham, |
715 |
String $token, |
716 |
String $newatime) |
717 |
|
718 |
Description: |
719 |
This method takes a C<$spam_count> and C<$ham_count> and adds it to |
720 |
C<$tok> along with updating C<$tok>s atime with C<$atime>. |
721 |
|
722 |
=cut |
723 |
|
724 |
sub tok_count_change { |
725 |
#dbg("BDB: Entering tok_count_change"); |
726 |
my($self, $dspam, $dham, $token, $newatime) = @_; |
727 |
$self->multi_tok_count_change ($dspam, $dham, {$token => 1}, $newatime); |
728 |
} |
729 |
|
730 |
=head2 multi_tok_count_change |
731 |
|
732 |
public instance (Boolean) multi_tok_count_change (Integer $dspam, |
733 |
Integer $dham, |
734 |
\% $tokens, |
735 |
String $newatime) |
736 |
|
737 |
Description: |
738 |
This method takes a C<$dspam> and C<$dham> and adds it to all of the |
739 |
tokens in the C<$tokens> hash ref along with updating each tokens |
740 |
atime with C<$atime>. |
741 |
|
742 |
=cut |
743 |
|
744 |
sub multi_tok_count_change { |
745 |
my($self, $dspam, $dham, $tokens, $newatime) = @_; |
746 |
|
747 |
# Make sure we have some values |
748 |
$dspam ||= 0; |
749 |
$dham ||= 0; |
750 |
$newatime ||= 0; |
751 |
|
752 |
# No changes, just return |
753 |
return 1 unless ($dspam or $dham); |
754 |
|
755 |
# Collect this for updates at the end |
756 |
my $newtokens = 0; |
757 |
|
758 |
for my $token (keys %{$tokens}) { |
759 |
my $status = $self->{handles}->{tokens}->db_get($token => my $value, $rmw); |
760 |
|
761 |
if ($status == 0) { |
762 |
my ($spam, $ham, $oldatime) = _unpack_token ($value); |
763 |
$spam += $dspam; |
764 |
$spam = 0 if ($spam < 0); |
765 |
$ham += $dham; |
766 |
$ham = 0 if ($ham < 0); |
767 |
my $newvalue = _pack_token($spam, $ham, $newatime); |
768 |
$self->{handles}->{tokens}->db_put($token => $newvalue) and die "Couldn't put record: $BerkeleyDB::Error"; |
769 |
} |
770 |
|
771 |
elsif ($status == DB_NOTFOUND) { |
772 |
my $spam = $dspam; |
773 |
$spam = 0 if ($spam < 0); |
774 |
my $ham = $dham; |
775 |
$ham = 0 if ($ham < 0); |
776 |
my $newvalue = _pack_token($spam, $ham, $newatime); |
777 |
$self->{handles}->{tokens}->db_put($token => $newvalue) and die "Couldn't put record: $BerkeleyDB::Error"; |
778 |
$newtokens++; |
779 |
} |
780 |
|
781 |
else { |
782 |
die "Couldn't get record: $BerkeleyDB::Error"; |
783 |
} |
784 |
} |
785 |
|
786 |
if ($newtokens) { |
787 |
my $ntokens = $self->_get(vars => "NTOKENS", $rmw) || 0; |
788 |
$ntokens += $newtokens; |
789 |
$ntokens = 0 if ($ntokens < 0); |
790 |
$self->{handles}->{vars}->db_put(NTOKENS => $ntokens) and die "Couldn't put record: $BerkeleyDB::Error"; |
791 |
} |
792 |
|
793 |
my $newmagic = $self->_get(vars => "NEWEST_TOKEN_AGE", $rmw) || 0; |
794 |
if ($newatime > $newmagic) { |
795 |
$self->{handles}->{vars}->db_put(NEWEST_TOKEN_AGE => $newatime) and die "Couldn't put record: $BerkeleyDB::Error"; |
796 |
} |
797 |
|
798 |
my $oldmagic = $self->_get(vars => "OLDEST_TOKEN_AGE", $rmw) || time; |
799 |
if ($newatime and $newatime < $oldmagic) { |
800 |
$self->{handles}->{vars}->db_put(OLDEST_TOKEN_AGE => $newatime) and die "Couldn't put record: $BerkeleyDB::Error"; |
801 |
} |
802 |
|
803 |
return 1; |
804 |
} |
805 |
|
806 |
=head2 nspam_nham_get |
807 |
|
808 |
public instance ($spam_count, $ham_count) nspam_nham_get () |
809 |
|
810 |
Description: |
811 |
This method retrieves the total number of spam and the total number of |
812 |
ham learned. |
813 |
|
814 |
=cut |
815 |
|
816 |
sub nspam_nham_get { |
817 |
#dbg("BDB: Entering nspam_nham_get"); |
818 |
my($self) = @_; |
819 |
my @vars = $self->get_storage_variables(); |
820 |
($vars[1], $vars[2]); |
821 |
} |
822 |
|
823 |
=head2 nspam_nham_change |
824 |
|
825 |
public instance (Boolean) nspam_nham_change (Integer $num_spam, |
826 |
Integer $num_ham) |
827 |
|
828 |
Description: |
829 |
This method updates the number of spam and the number of ham in the database. |
830 |
|
831 |
=cut |
832 |
|
833 |
sub nspam_nham_change { |
834 |
my($self, $ds, $dh) = @_; |
835 |
|
836 |
my $nspam = $self->_get(vars => "NSPAM", $rmw) || 0; |
837 |
$nspam += ($ds || 0); |
838 |
$nspam = 0 if ($nspam < 0); |
839 |
$self->{handles}->{vars}->db_put(NSPAM => $nspam) and die "Couldn't put record: $BerkeleyDB::Error"; |
840 |
|
841 |
my $nham = $self->_get(vars => "NHAM", $rmw) || 0; |
842 |
$nham += ($dh || 0); |
843 |
$nham = 0 if ($nham < 0); |
844 |
$self->{handles}->{vars}->db_put(NHAM => $nham) and die "Couldn't put record: $BerkeleyDB::Error"; |
845 |
|
846 |
return 1; |
847 |
} |
848 |
|
849 |
=head2 tok_touch |
850 |
|
851 |
public instance (Boolean) tok_touch (String $token, |
852 |
String $atime) |
853 |
|
854 |
Description: |
855 |
This method updates the given tokens (C<$token>) atime. |
856 |
|
857 |
The assumption is that the token already exists in the database. |
858 |
|
859 |
We will never update to an older atime |
860 |
|
861 |
=cut |
862 |
|
863 |
sub tok_touch { |
864 |
my($self, $token, $atime) = @_; |
865 |
return $self->tok_touch_all ([$token], $atime); |
866 |
} |
867 |
|
868 |
=head2 tok_touch_all |
869 |
|
870 |
public instance (Boolean) tok_touch (\@ $tokens |
871 |
String $atime) |
872 |
|
873 |
Description: |
874 |
This method does a mass update of the given list of tokens C<$tokens>, |
875 |
if the existing token atime is < C<$atime>. |
876 |
|
877 |
The assumption is that the tokens already exist in the database. |
878 |
|
879 |
We should never be touching more than N_SIGNIFICANT_TOKENS, so we can |
880 |
make some assumptions about how to handle the data (ie no need to |
881 |
batch like we do in tok_get_all) |
882 |
|
883 |
=cut |
884 |
|
885 |
sub tok_touch_all { |
886 |
my($self, $tokens, $newatime) = @_; |
887 |
|
888 |
for my $token (@{$tokens}) { |
889 |
my $status = $self->{handles}->{tokens}->db_get($token => my $value, $rmw); |
890 |
if ($status == 0) { |
891 |
my ($spam, $ham, $oldatime) = _unpack_token ($value); |
892 |
my $newvalue = _pack_token ($spam, $ham, $newatime); |
893 |
$self->{handles}->{tokens}->db_put($token => $newvalue) and die "Couldn't put record: $BerkeleyDB::Error"; |
894 |
} |
895 |
|
896 |
elsif ($status == DB_NOTFOUND) { |
897 |
# Do nothing |
898 |
} |
899 |
|
900 |
else { |
901 |
die "Couldn't get record: $BerkeleyDB::Error"; |
902 |
} |
903 |
} |
904 |
|
905 |
return 1; |
906 |
} |
907 |
|
908 |
=head2 cleanup |
909 |
|
910 |
public instance (Boolean) cleanup () |
911 |
|
912 |
Description: |
913 |
This method peroms any cleanup necessary before moving onto the next |
914 |
operation. |
915 |
|
916 |
=cut |
917 |
|
918 |
sub cleanup { |
919 |
my ($self) = @_; |
920 |
#dbg("Running cleanup"); |
921 |
return 1; |
922 |
} |
923 |
|
924 |
=head2 get_magic_re |
925 |
|
926 |
public instance (String) get_magic_re () |
927 |
|
928 |
Description: |
929 |
This method returns a regexp which indicates a magic token. |
930 |
|
931 |
Unused in BDB implementation. |
932 |
|
933 |
=cut |
934 |
|
935 |
use constant get_magic_re => undef; |
936 |
|
937 |
=head2 sync |
938 |
|
939 |
public instance (Boolean) sync (\% $opts) |
940 |
|
941 |
Description: |
942 |
This method performs a sync of the database |
943 |
|
944 |
=cut |
945 |
|
946 |
sub sync { |
947 |
my($self, $opts) = @_; |
948 |
#dbg("Running sync"); |
949 |
return 1; |
950 |
} |
951 |
|
952 |
=head2 perform_upgrade |
953 |
|
954 |
public instance (Boolean) perform_upgrade (\% $opts); |
955 |
|
956 |
Description: |
957 |
Performs an upgrade of the database from one version to another, not |
958 |
currently used in this implementation. |
959 |
|
960 |
=cut |
961 |
|
962 |
sub perform_upgrade { |
963 |
#dbg("BDB: Entering perform_upgrade"); |
964 |
return 1; |
965 |
} |
966 |
|
967 |
=head2 clear_database |
968 |
|
969 |
public instance (Boolean) clear_database () |
970 |
|
971 |
Description: |
972 |
This method deletes all records for a particular user. |
973 |
|
974 |
Callers should be aware that any errors returned by this method |
975 |
could causes the database to be inconsistent for the given user. |
976 |
|
977 |
=cut |
978 |
|
979 |
sub clear_database { |
980 |
#dbg("BDB: Entering clear_database"); |
981 |
my($self) = @_; |
982 |
|
983 |
$self->untie_db(); |
984 |
#dbg("BDB: removing db."); |
985 |
my $main = $self->{bayes}->{main}; |
986 |
my $path = $main->sed_path($main->{conf}->{bayes_path}); |
987 |
eval {rmpath($path)}; |
988 |
return 1; |
989 |
} |
990 |
|
991 |
=head2 backup_database |
992 |
|
993 |
public instance (Boolean) backup_database () |
994 |
|
995 |
Description: |
996 |
This method will dump the users database in a machine readable format. |
997 |
|
998 |
=cut |
999 |
|
1000 |
sub backup_database { |
1001 |
#dbg("BDB: Entering backup_database"); |
1002 |
my($self) = @_; |
1003 |
return 0 unless $self->tie_db_writable; |
1004 |
my @vars = $self->get_storage_variables; |
1005 |
|
1006 |
print "v\t$vars[6]\tdb_version # this must be the first line!!!\n"; |
1007 |
print "v\t$vars[1]\tnum_spam\n"; |
1008 |
print "v\t$vars[2]\tnum_nonspam\n"; |
1009 |
|
1010 |
my $tokens = $self->{handles}->{tokens}->db_cursor or die "Couldn't get cursor: $BerkeleyDB::Error"; |
1011 |
|
1012 |
my($token, $value) = ("", ""); |
1013 |
while ($tokens->c_get($token, $value, $next) == 0) { |
1014 |
my($ts, $th, $atime) = _unpack_token($value); |
1015 |
my $encoded = unpack("H*", $token); |
1016 |
print "t\t$ts\t$th\t$atime\t$encoded\n"; |
1017 |
} |
1018 |
|
1019 |
$tokens->c_close and die "Couldn't close cursor: $BerkeleyDB::Error"; |
1020 |
undef $tokens; |
1021 |
|
1022 |
my $seen = $self->{handles}->{seen}->db_cursor or die "Couldn't get cursor: $BerkeleyDB::Error"; |
1023 |
|
1024 |
$token = ""; |
1025 |
while ($seen->c_get($token, $value, $next) == 0) { |
1026 |
print "s\t$token\t$value\n"; |
1027 |
} |
1028 |
|
1029 |
$seen->c_close and die "Couldn't close cursor: $BerkeleyDB::Error"; |
1030 |
undef $seen; |
1031 |
|
1032 |
$self->untie_db(); |
1033 |
|
1034 |
return 1; |
1035 |
} |
1036 |
|
1037 |
=head2 restore_database |
1038 |
|
1039 |
public instance (Boolean) restore_database (String $filename, Boolean $showdots) |
1040 |
|
1041 |
Description: |
1042 |
This method restores a database from the given filename, C<$filename>. |
1043 |
|
1044 |
Callers should be aware that any errors returned by this method |
1045 |
could causes the database to be inconsistent for the given user. |
1046 |
|
1047 |
=cut |
1048 |
|
1049 |
sub restore_database { |
1050 |
#dbg("BDB: Entering restore_database"); |
1051 |
my ($self, $filename, $showdots) = @_; |
1052 |
|
1053 |
local *DUMPFILE; |
1054 |
if (!open(DUMPFILE, '<', $filename)) { |
1055 |
#dbg("BDB: unable to open backup file $filename: $!"); |
1056 |
return 0; |
1057 |
} |
1058 |
|
1059 |
# This is the critical phase (moving sql around), so don't allow it |
1060 |
# to be interrupted. |
1061 |
local $SIG{'INT'} = 'IGNORE'; |
1062 |
local $SIG{'HUP'} = 'IGNORE' if (!Mail::SpamAssassin::Util::am_running_on_windows()); |
1063 |
local $SIG{'TERM'} = 'IGNORE'; |
1064 |
|
1065 |
unless ($self->clear_database()) { |
1066 |
return 0; |
1067 |
} |
1068 |
|
1069 |
# we need to go ahead close the db connection so we can then open it up |
1070 |
# in a fresh state after clearing |
1071 |
$self->untie_db(); |
1072 |
|
1073 |
unless ($self->tie_db_writable()) { |
1074 |
return 0; |
1075 |
} |
1076 |
|
1077 |
my $token_count = 0; |
1078 |
my $db_version; |
1079 |
my $num_spam; |
1080 |
my $num_ham; |
1081 |
my $error_p = 0; |
1082 |
my $line_count = 0; |
1083 |
|
1084 |
my $line = <DUMPFILE>; |
1085 |
defined $line or die "Error reading dump file: $!"; |
1086 |
$line_count++; |
1087 |
# We require the database version line to be the first in the file so we can |
1088 |
# figure out how to properly deal with the file. If it is not the first |
1089 |
# line then fail |
1090 |
if ($line =~ m/^v\s+(\d+)\s+db_version/) { |
1091 |
$db_version = $1; |
1092 |
} else { |
1093 |
#dbg("BDB: database version must be the first line in the backup file, correct and re-run"); |
1094 |
return 0; |
1095 |
} |
1096 |
|
1097 |
unless ($db_version == 2 || $db_version == 3) { |
1098 |
warn("BDB: database version $db_version is unsupported, must be version 2 or 3"); |
1099 |
return 0; |
1100 |
} |
1101 |
|
1102 |
my $token_error_count = 0; |
1103 |
my $seen_error_count = 0; |
1104 |
|
1105 |
for ($!=0; defined($line=<DUMPFILE>); $!=0) { |
1106 |
chomp($line); |
1107 |
$line_count++; |
1108 |
|
1109 |
if ($line_count % 1000 == 0) { |
1110 |
print STDERR "." if ($showdots); |
1111 |
} |
1112 |
|
1113 |
if ($line =~ /^v\s+/) { # variable line |
1114 |
my @parsed_line = split(/\s+/, $line, 3); |
1115 |
my $value = $parsed_line[1] + 0; |
1116 |
if ($parsed_line[2] eq 'num_spam') { |
1117 |
$num_spam = $value; |
1118 |
} elsif ($parsed_line[2] eq 'num_nonspam') { |
1119 |
$num_ham = $value; |
1120 |
} else { |
1121 |
#dbg("BDB: restore_database: skipping unknown line: $line"); |
1122 |
} |
1123 |
} elsif ($line =~ /^t\s+/) { # token line |
1124 |
my @parsed_line = split(/\s+/, $line, 5); |
1125 |
my $spam_count = $parsed_line[1] + 0; |
1126 |
my $ham_count = $parsed_line[2] + 0; |
1127 |
my $atime = $parsed_line[3] + 0; |
1128 |
my $token = $parsed_line[4]; |
1129 |
|
1130 |
my $token_warn_p = 0; |
1131 |
my @warnings; |
1132 |
|
1133 |
if ($spam_count < 0) { |
1134 |
$spam_count = 0; |
1135 |
push(@warnings, 'spam count < 0, resetting'); |
1136 |
$token_warn_p = 1; |
1137 |
} |
1138 |
if ($ham_count < 0) { |
1139 |
$ham_count = 0; |
1140 |
push(@warnings, 'ham count < 0, resetting'); |
1141 |
$token_warn_p = 1; |
1142 |
} |
1143 |
|
1144 |
if ($spam_count == 0 && $ham_count == 0) { |
1145 |
#dbg("BDB: token has zero spam and ham count, skipping"); |
1146 |
next; |
1147 |
} |
1148 |
|
1149 |
if ($atime > time()) { |
1150 |
$atime = time(); |
1151 |
push(@warnings, 'atime > current time, resetting'); |
1152 |
$token_warn_p = 1; |
1153 |
} |
1154 |
|
1155 |
if ($token_warn_p) { |
1156 |
#dbg("BDB: token ($token) has the following warnings:\n".join("\n",@warnings)); |
1157 |
} |
1158 |
|
1159 |
if ($db_version < 3) { |
1160 |
# versions < 3 use plain text tokens, so we need to convert to hash |
1161 |
$token = substr(sha1($token), -5); |
1162 |
} else { |
1163 |
# turn unpacked binary token back into binary value |
1164 |
$token = pack("H*",$token); |
1165 |
} |
1166 |
|
1167 |
unless ($self->_put_token($token, $spam_count, $ham_count, $atime)) { |
1168 |
#dbg("BDB: error inserting token for line: $line"); |
1169 |
$token_error_count++; |
1170 |
} |
1171 |
$token_count++; |
1172 |
} elsif ($line =~ /^s\s+/) { # seen line |
1173 |
my @parsed_line = split(/\s+/, $line, 3); |
1174 |
my $flag = $parsed_line[1]; |
1175 |
my $msgid = $parsed_line[2]; |
1176 |
|
1177 |
unless ($flag eq 'h' || $flag eq 's') { |
1178 |
#dbg("BDB: unknown seen flag ($flag) for line: $line, skipping"); |
1179 |
next; |
1180 |
} |
1181 |
|
1182 |
unless ($msgid) { |
1183 |
#dbg("BDB: blank msgid for line: $line, skipping"); |
1184 |
next; |
1185 |
} |
1186 |
|
1187 |
unless ($self->seen_put($msgid, $flag)) { |
1188 |
#dbg("BDB: error inserting msgid in seen table for line: $line"); |
1189 |
$seen_error_count++; |
1190 |
} |
1191 |
} else { |
1192 |
#dbg("BDB: skipping unknown line: $line"); |
1193 |
next; |
1194 |
} |
1195 |
|
1196 |
if ($token_error_count >= 20) { |
1197 |
warn "BDB: encountered too many errors (20) while parsing token line, reverting to empty database and exiting\n"; |
1198 |
$self->clear_database(); |
1199 |
return 0; |
1200 |
} |
1201 |
|
1202 |
if ($seen_error_count >= 20) { |
1203 |
warn "BDB: encountered too many errors (20) while parsing seen lines, reverting to empty database and exiting\n"; |
1204 |
$self->clear_database(); |
1205 |
return 0; |
1206 |
} |
1207 |
} |
1208 |
defined $line || $!==0 or |
1209 |
$!==EBADF ? dbg("BDB: error reading dump file: $!") |
1210 |
: die "error reading dump file: $!"; |
1211 |
close(DUMPFILE) or die "Can't close dump file: $!"; |
1212 |
|
1213 |
print STDERR "\n" if ($showdots); |
1214 |
|
1215 |
unless (defined($num_spam)) { |
1216 |
#dbg("BDB: unable to find num spam, please check file"); |
1217 |
$error_p = 1; |
1218 |
} |
1219 |
|
1220 |
unless (defined($num_ham)) { |
1221 |
#dbg("BDB: unable to find num ham, please check file"); |
1222 |
$error_p = 1; |
1223 |
} |
1224 |
|
1225 |
if ($error_p) { |
1226 |
#dbg("BDB: error(s) while attempting to load $filename, clearing database, correct and re-run"); |
1227 |
$self->clear_database(); |
1228 |
return 0; |
1229 |
} |
1230 |
|
1231 |
if ($num_spam || $num_ham) { |
1232 |
unless ($self->nspam_nham_change($num_spam, $num_ham)) { |
1233 |
#dbg("BDB: error updating num spam and num ham, clearing database"); |
1234 |
$self->clear_database(); |
1235 |
return 0; |
1236 |
} |
1237 |
} |
1238 |
|
1239 |
#dbg("BDB: parsed $line_count lines"); |
1240 |
#dbg("BDB: created database with $token_count tokens based on $num_spam spam messages and $num_ham ham messages"); |
1241 |
|
1242 |
$self->untie_db(); |
1243 |
|
1244 |
return 1; |
1245 |
} |
1246 |
|
1247 |
=head2 db_readable |
1248 |
|
1249 |
public instance (Boolean) db_readable() |
1250 |
|
1251 |
Description: |
1252 |
This method returns a boolean value indicating if the database is in a |
1253 |
readable state. |
1254 |
|
1255 |
=cut |
1256 |
|
1257 |
sub db_readable { |
1258 |
#dbg("BDB: Entering db_readable"); |
1259 |
my($self) = @_; |
1260 |
return $self->{already_tied}; |
1261 |
} |
1262 |
|
1263 |
=head2 db_writable |
1264 |
|
1265 |
public instance (Boolean) db_writeable() |
1266 |
|
1267 |
Description: |
1268 |
This method returns a boolean value indicating if the database is in a |
1269 |
writable state. |
1270 |
|
1271 |
=cut |
1272 |
|
1273 |
sub db_writable { |
1274 |
#dbg("BDB: Entering db_writeable"); |
1275 |
my($self) = @_; |
1276 |
return($self->{already_tied} and $self->{is_locked}); |
1277 |
} |
1278 |
|
1279 |
=head2 _extract_atime |
1280 |
|
1281 |
private instance () _extract_atime (String $token, |
1282 |
String $value, |
1283 |
String $index) |
1284 |
|
1285 |
Description: |
1286 |
This method ensures that the database connetion is properly setup and |
1287 |
working. If appropriate it will initialize a users bayes variables so |
1288 |
that they can begin using the database immediately. |
1289 |
|
1290 |
=cut |
1291 |
|
1292 |
sub _extract_atime { |
1293 |
#dbg("BDB: Entering _extract_atime"); |
1294 |
my ($token, $value) = @_; |
1295 |
my($ts, $th, $atime) = _unpack_token($value); |
1296 |
#dbg("BDB: _extract_atime found $atime for $token"); |
1297 |
$_[2] = $atime; |
1298 |
#dbg("BDB: Leaving db_writeable"); |
1299 |
return 0; |
1300 |
} |
1301 |
|
1302 |
=head2 _put_token |
1303 |
|
1304 |
FIXME: This is rarely a good interface, because of the churn that will |
1305 |
often happen in the "magic" tokens. Open-code this stuff in the |
1306 |
presence of loops. |
1307 |
|
1308 |
=cut |
1309 |
|
1310 |
sub _put_token { |
1311 |
#dbg("BDB: Entering _put_token"); |
1312 |
my($self, $token, $ts, $th, $atime) = @_; |
1313 |
|
1314 |
$ts ||= 0; |
1315 |
$th ||= 0; |
1316 |
|
1317 |
#dbg("BDB: $token has spam $ts, ham $th, atime $atime"); |
1318 |
|
1319 |
my $value = $self->_get(tokens => $token, $rmw); |
1320 |
|
1321 |
my $exists_already = defined $value ? 1 : 0; |
1322 |
|
1323 |
#dbg("BDB: $token exists: $exists_already"); |
1324 |
if ($ts == 0 && $th == 0) { |
1325 |
return unless($exists_already); # If the token doesn't exist, just return |
1326 |
my $ntokens = $self->_get(vars => "NTOKENS", $rmw); |
1327 |
$self->{handles}->{vars}->db_put(NTOKENS => --$ntokens) and die "Couldn't put record: $BerkeleyDB::Error"; |
1328 |
#dbg("BDB: ntokens is $ntokens"); |
1329 |
|
1330 |
my $status = $self->{handles}->{tokens}->db_del($token); |
1331 |
|
1332 |
die "Couldn't delete record: $BerkeleyDB::Error" unless ($status == 0 or $status == DB_NOTFOUND); |
1333 |
#dbg("BDB: $token deleted"); |
1334 |
} else { |
1335 |
unless($exists_already) { # If the token doesn't exist, raise the token count |
1336 |
my $ntokens = $self->_get(vars => "NTOKENS", $rmw); |
1337 |
$self->{handles}->{vars}->db_put(NTOKENS => ++$ntokens) and die "Couldn't put record: $BerkeleyDB::Error"; |
1338 |
#dbg("BDB: ntokens is $ntokens"); |
1339 |
} |
1340 |
|
1341 |
my $newmagic = $self->_get(vars => "NEWEST_TOKEN_AGE", $rmw) || 0; |
1342 |
#dbg("BDB: NEWEST_TOKEN_AGE is $newmagic"); |
1343 |
|
1344 |
if ($atime > $newmagic) { |
1345 |
#dbg("BDB: Updating NEWEST_TOKEN_AGE"); |
1346 |
$self->{handles}->{vars}->db_put(NEWEST_TOKEN_AGE => $atime) and die "Couldn't put record: $BerkeleyDB::Error"; |
1347 |
} |
1348 |
|
1349 |
my $oldmagic = $self->_get(vars => "OLDEST_TOKEN_AGE", $rmw) || time; |
1350 |
#dbg("BDB: OLDEST_TOKEN_AGE is $oldmagic"); |
1351 |
if ($atime and $atime < $oldmagic) { |
1352 |
#dbg("BDB: Updating OLDEST_TOKEN_AGE to $atime"); |
1353 |
$self->{handles}->{vars}->db_put(OLDEST_TOKEN_AGE => $atime) and die "Couldn't put record: $BerkeleyDB::Error"; |
1354 |
} |
1355 |
|
1356 |
my $value = _pack_token($ts, $th, $atime); |
1357 |
|
1358 |
#dbg("BDB: Setting $token to $value"); |
1359 |
#dbg("BDB: Handle is $self->{handles}->{tokens}"); |
1360 |
|
1361 |
$self->{handles}->{tokens}->db_put($token, $value) and die "Couldn't put record: $BerkeleyDB::Error"; |
1362 |
} |
1363 |
|
1364 |
#dbg("BDB: Leaving _put_token"); |
1365 |
return 1; |
1366 |
} |
1367 |
|
1368 |
# token marshalling format for tokens. |
1369 |
|
1370 |
# Since we may have many entries with few hits, especially thousands of hapaxes |
1371 |
# (1-occurrence entries), use a flexible entry format, instead of simply "2 |
1372 |
# packed ints", to keep the memory and disk space usage down. In my |
1373 |
# 18k-message test corpus, only 8.9% have >= 8 hits in either counter, so we |
1374 |
# can use a 1-byte representation for the other 91% of low-hitting entries |
1375 |
# and save masses of space. |
1376 |
|
1377 |
# This looks like: XXSSSHHH (XX = format bits, SSS = 3 spam-count bits, HHH = 3 |
1378 |
# ham-count bits). If XX in the first byte is 11, it's packed as this 1-byte |
1379 |
# representation; otherwise, if XX in the first byte is 00, it's packed as |
1380 |
# "CLL", ie. 1 byte and 2 32-bit "longs" in perl pack format. |
1381 |
|
1382 |
# Savings: roughly halves size of toks db, at the cost of a ~10% slowdown. |
1383 |
|
1384 |
use constant FORMAT_FLAG => 0xc0; # 11000000 |
1385 |
use constant ONE_BYTE_FORMAT => 0xc0; # 11000000 |
1386 |
use constant TWO_LONGS_FORMAT => 0x00; # 00000000 |
1387 |
|
1388 |
use constant ONE_BYTE_SSS_BITS => 0x38; # 00111000 |
1389 |
use constant ONE_BYTE_HHH_BITS => 0x07; # 00000111 |
1390 |
|
1391 |
sub _unpack_token { |
1392 |
my $value = shift || 0; |
1393 |
|
1394 |
my($packed, $ts, $th, $atime) = unpack("CVVV", $value); |
1395 |
|
1396 |
if (($packed & FORMAT_FLAG) == ONE_BYTE_FORMAT) { |
1397 |
return (($packed & ONE_BYTE_SSS_BITS) >> 3, |
1398 |
$packed & ONE_BYTE_HHH_BITS, |
1399 |
$ts || 0); # The one-byte-format uses that first 32-bit long as atime |
1400 |
} elsif (($packed & FORMAT_FLAG) == TWO_LONGS_FORMAT) { |
1401 |
return ($ts || 0, $th || 0, $atime || 0); |
1402 |
} else { |
1403 |
warn "BDB: unknown packing format for bayes db, please re-learn: $packed"; |
1404 |
return (0, 0, 0); |
1405 |
} |
1406 |
} |
1407 |
|
1408 |
sub _pack_token { |
1409 |
my($ts, $th, $atime) = @_; |
1410 |
$ts ||= 0; $th ||= 0; $atime ||= 0; |
1411 |
if ($ts < 8 && $th < 8) { |
1412 |
return pack("CV", (ONE_BYTE_FORMAT | ($ts << 3) | $th) & 255, $atime); |
1413 |
} else { |
1414 |
return pack("CVVV", TWO_LONGS_FORMAT, $ts, $th, $atime); |
1415 |
} |
1416 |
} |
1417 |
|
1418 |
sub _get { |
1419 |
my ($self, $table, $key, $flags) = @_; |
1420 |
|
1421 |
$flags |= 0; |
1422 |
|
1423 |
my $value = ""; |
1424 |
|
1425 |
my $status = $self->{handles}->{$table}->db_get($key => $value, $flags); |
1426 |
|
1427 |
if ($status == 0) { |
1428 |
return $value; |
1429 |
} elsif ($status == DB_NOTFOUND) { |
1430 |
return undef; |
1431 |
} else { |
1432 |
die "Couldn't get record: $BerkeleyDB::Error"; |
1433 |
} |
1434 |
} |
1435 |
|
1436 |
sub sa_die { Mail::SpamAssassin::sa_die(@_); } |
1437 |
|
1438 |
1; |
1439 |
|
1440 |
# Local Variables: |
1441 |
# mode: CPerl |
1442 |
# cperl-indent-level: 2 |
1443 |
# End: |