View | Details | Raw Unified | Return to bug 6046
Collapse All | Expand All

(-)MANIFEST (+2 lines)
Lines 33-38 Link Here
33
lib/Mail/SpamAssassin/Bayes/CombineChi.pm
33
lib/Mail/SpamAssassin/Bayes/CombineChi.pm
34
lib/Mail/SpamAssassin/Bayes/CombineNaiveBayes.pm
34
lib/Mail/SpamAssassin/Bayes/CombineNaiveBayes.pm
35
lib/Mail/SpamAssassin/BayesStore.pm
35
lib/Mail/SpamAssassin/BayesStore.pm
36
lib/Mail/SpamAssassin/BayesStore/BDB.pm
36
lib/Mail/SpamAssassin/BayesStore/DBM.pm
37
lib/Mail/SpamAssassin/BayesStore/DBM.pm
37
lib/Mail/SpamAssassin/BayesStore/MySQL.pm
38
lib/Mail/SpamAssassin/BayesStore/MySQL.pm
38
lib/Mail/SpamAssassin/BayesStore/PgSQL.pm
39
lib/Mail/SpamAssassin/BayesStore/PgSQL.pm
Lines 203-208 Link Here
203
t/SATest.pm
204
t/SATest.pm
204
t/basic_lint.t
205
t/basic_lint.t
205
t/basic_obj_api.t
206
t/basic_obj_api.t
207
t/bayesbdb.t
206
t/bayesdbm.t
208
t/bayesdbm.t
207
t/bayesdbm_flock.t
209
t/bayesdbm_flock.t
208
t/bayessdbm.t
210
t/bayessdbm.t
(-)lib/Mail/SpamAssassin/BayesStore/BDB.pm (+1443 lines)
Line 0 Link Here
1
# <@LICENSE>
2
# Licensed to the Apache Software Foundation (ASF) under one or more
3
# contributor license agreements.  See the NOTICE file distributed with
4
# this work for additional information regarding copyright ownership.
5
# The ASF licenses this file to you under the Apache License, Version 2.0
6
# (the "License"); you may not use this file except in compliance with
7
# the License.  You may obtain a copy of the License at:
8
#
9
#     http://www.apache.org/licenses/LICENSE-2.0
10
#
11
# Unless required by applicable law or agreed to in writing, software
12
# distributed under the License is distributed on an "AS IS" BASIS,
13
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
# See the License for the specific language governing permissions and
15
# limitations under the License.
16
# </@LICENSE>
17
18
=head1 NAME
19
20
Mail::SpamAssassin::BayesStore::BDB - BerkeleyDB Bayesian Storage Module Implementation
21
22
=head1 SYNOPSIS
23
24
=head1 DESCRIPTION
25
26
This module implementes a BDB based bayesian storage module.
27
28
=cut
29
30
package Mail::SpamAssassin::BayesStore::BDB;
31
32
use strict;
33
use warnings;
34
use bytes;
35
use re 'taint';
36
use Errno qw(EBADF);
37
use Data::Dumper;
38
use Digest::SHA1 qw{sha1};
39
use File::Basename;
40
use File::Path;
41
42
use Mail::SpamAssassin::BayesStore;
43
use Mail::SpamAssassin::Logger;
44
45
use vars qw( @ISA );
46
47
@ISA = qw( Mail::SpamAssassin::BayesStore );
48
49
use constant HAS_BDB => eval { require BerkeleyDB; BerkeleyDB->import; };
50
51
my $rmw = DB_RMW;
52
my $next = DB_NEXT;
53
54
=head1 METHODS
55
56
=head2 new
57
58
public class (Mail::SpamAssassin::BayesStore::SQL) new (Mail::Spamassassin::Plugin::Bayes $bayes)
59
60
Description:
61
This methods creates a new instance of the Mail::SpamAssassin::BayesStore::BDB
62
object.  It expects to be passed an instance of the Mail::SpamAssassin:Bayes
63
object which is passed into the Mail::SpamAssassin::BayesStore parent object.
64
65
=cut
66
67
sub new {
68
  my $class = shift;
69
  $class = ref($class) || $class;
70
  my $self = $class->SUPER::new(@_);
71
  $self->{supported_db_version} = 3;
72
  $self->{already_tied} = 0;
73
  $self->{is_locked} = 0;
74
  return $self;
75
}
76
77
=head2 tie_db_readonly
78
79
public instance (Boolean) tie_db_readonly ();
80
81
Description:
82
This method ensures that the database connection is properly setup and
83
working.  It takes 'read-only' very seriously, and will not try to
84
initialize anything.
85
86
=cut
87
88
sub tie_db_readonly {
89
  my($self) = @_;
90
  #dbg("BDB: tie_db_readonly");
91
  my $result = ($self->{already_tied} and $self->{is_locked} == 0) || $self->_tie_db(0);
92
  #dbg("BDB: tie_db_readonly result is $result");
93
  return $result;
94
}
95
96
=head2 tie_db_writable
97
98
public instance (Boolean) tie_db_writable ()
99
100
Description:
101
This method ensures that the database connetion is properly setup and
102
working. If necessary it will initialize the database so that they can
103
begin using the database immediately.
104
105
=cut
106
107
sub tie_db_writable {
108
  my($self) = @_;
109
  #dbg("BDB: tie_db_writable");
110
  my $result = ($self->{already_tied} and $self->{is_locked} == 1) || $self->_tie_db(1);
111
  #dbg("BDB: tie_db_writable result is $result");
112
  return $result;
113
}
114
115
=head2 _tie_db
116
117
private instance (Boolean) _tie_db (Boolean $writeable)
118
119
Description:
120
This method ensures that the database connetion is properly setup and
121
working.  If it will initialize a users bayes variables so that they
122
can begin using the database immediately.
123
124
=cut
125
126
sub _tie_db {
127
  my($self, $writeable) = @_;
128
129
  #dbg("BDB: _tie_db($writeable)");
130
131
  # Always notice state changes
132
  $self->{is_locked} = $writeable;
133
134
  return 1 if($self->{already_tied});
135
136
  #dbg("BDB: not already tied");
137
138
  my $main = $self->{bayes}->{main};
139
140
  if (!defined($main->{conf}->{bayes_path})) {
141
    #dbg("BDB: bayes_path not defined");
142
    return 0;
143
  }
144
145
  #dbg("BDB: Reading db configs");
146
  $self->read_db_configs();
147
148
  my $path = dirname $main->sed_path($main->{conf}->{bayes_path});
149
150
  #dbg("BDB: Path is $path");
151
  # Path must exist or we must be in writeable mode
152
  if (-d $path) {
153
    # All is cool
154
  } elsif ($writeable) {
155
    # Create the path
156
    eval {
157
      mkpath($path, 0, (oct($main->{conf}->{bayes_file_mode}) & 0777));
158
    };
159
    warn("BDB: Couldn't create path: $@") if ($@);
160
  } else {
161
    # FAIL
162
    warn("BDB: bayes_path doesn't exist and can't create: $path");
163
    return 0;
164
  }
165
166
  # Now we can set up our environment
167
  my $flags = DB_INIT_LOCK|DB_INIT_LOG|DB_INIT_MPOOL|DB_INIT_TXN;
168
  $flags |= DB_CREATE if($writeable);
169
  # DB_REGISTER|DB_RECOVER|
170
171
  #dbg("BDB: Creating environment: $path, $flags, $main->{conf}->{bayes_file_mode}");
172
  unless ($self->{env} = BerkeleyDB::Env->new(-Cachesize => 67108864, -Home => $path, -Flags => $flags, -Mode =>(oct($main->{conf}->{bayes_file_mode}) & 0666), -SetFlags => DB_LOG_AUTOREMOVE)) {
173
    #dbg("BDB: berkeleydb environment couldn't initialize: $BerkeleyDB::Error");
174
    return 0;
175
  }
176
177
  $flags = $writeable ? DB_CREATE : 0;
178
179
  #dbg("BDB: Opening vars");
180
  unless ($self->{handles}->{vars} = BerkeleyDB::Btree->new(-Env => $self->{env}, -Filename => "vars.db", -Flags => $flags)) {
181
    warn("BDB: couldn't open vars.db: $BerkeleyDB::Error");
182
    $self->untie_db;
183
    return 0;
184
  }
185
186
  #dbg("BDB: Looking for db_version");
187
  unless ($self->{db_version} = $self->_get(vars => "DB_VERSION")) {
188
    if ($writeable) {
189
      $self->{db_version} = $self->DB_VERSION;
190
      $self->{handles}->{vars}->db_put(DB_VERSION => $self->{db_version}) and die "Couldn't put record: $BerkeleyDB::Error";
191
      $self->{handles}->{vars}->db_put(NTOKENS => 0) and die "Couldn't put record: $BerkeleyDB::Error";
192
      #dbg("BDB: new db, set db version " . $self->{db_version} . " and 0 tokens");
193
    } else {
194
      warn("BDB: vars.db not intialized: $BerkeleyDB::Error");
195
      $self->untie_db;
196
      return 0;
197
    }
198
  } elsif ($self->{db_version}) {
199
    #dbg("BDB: found bayes db version $self->{db_version}");
200
    if ($self->{db_version} != $self->DB_VERSION) {
201
      warn("BDB: bayes db version $self->{db_version} is not able to be used, aborting: $BerkeleyDB::Error");
202
      $self->untie_db();
203
      return 0;
204
    }
205
  }
206
207
  #dbg("BDB: Opening tokens");
208
  unless ($self->{handles}->{tokens} = BerkeleyDB::Btree->new(-Env => $self->{env}, -Filename => "tokens.db", -Flags => $flags, -Property => DB_REVSPLITOFF)) {
209
    warn("BDB: couldn't open tokens.db: $BerkeleyDB::Error");
210
    $self->untie_db;
211
    return 0;
212
  }
213
214
  #dbg("BDB: Opening atime secondary DB");
215
  unless ($self->{handles}->{atime} = BerkeleyDB::Btree->new(-Env => $self->{env}, -Filename => "atime.db", -Flags => $flags, -Property => DB_DUP|DB_DUPSORT)) {
216
    warn("BDB: couldn't open atime.db: $BerkeleyDB::Error");
217
    $self->untie_db;
218
    return 0;
219
  }
220
221
  #dbg("BDB: Opening seen DB");
222
  unless ($self->{handles}->{seen} = BerkeleyDB::Btree->new(-Env => $self->{env}, -Filename => "seen.db", -Flags => $flags)) {
223
    warn("BDB: couldn't open tokens.db: $BerkeleyDB::Error");
224
    $self->untie_db;
225
    return 0;
226
  }
227
228
  # This MUST be outside the transaction that opens the DB, or it just doesn't work.  Dunno Why.
229
  $self->{handles}->{tokens}->associate($self->{handles}->{atime}, \&_extract_atime) and die "Couldn't associate DBs: $BerkeleyDB::Error";
230
231
  $self->{already_tied} = 1;
232
233
  return 1;
234
}
235
236
=head2 untie_db
237
238
public instance () untie_db ()
239
240
Description:
241
Closes any open db handles.  You can safely call this at any time.
242
243
=cut
244
245
sub untie_db {
246
  my $self = shift;
247
248
  $self->{is_locked} = 0;
249
  $self->{already_tied} = 0;
250
  $self->{db_version} = undef;
251
252
  for my $handle (keys %{$self->{handles}}) {
253
    # Since we are using transactions, this should be fine
254
    $self->{handles}->{$handle}->db_close (DB_NOSYNC);
255
    delete $self->{handles}->{$handle};
256
  }
257
258
  $self->{env}->txn_checkpoint (128, 1) if $self->{env};
259
260
  delete $self->{env};
261
  return undef;
262
}
263
264
=head2 calculate_expire_delta
265
266
public instance (%) calculate_expire_delta (Integer $newest_atime,
267
                                            Integer $start,
268
                                            Integer $max_expire_mult)
269
270
Description:
271
This method performs a calculation on the data to determine the
272
optimum atime for token expiration.
273
274
=cut
275
276
sub calculate_expire_delta {
277
  #dbg("BDB: calculate_expire_delta starting");
278
  my($self, $newest_atime, $start, $max_expire_mult) = @_;
279
280
  my %delta;    # use a hash since an array is going to be very sparse
281
282
  my $cursor = $self->{handles}->{atime}->db_cursor or die "Couldn't get cursor: $BerkeleyDB::Error";
283
284
  my($atime, $value) = ("", "");
285
286
  # Do the first pass, figure out atime delta by iterating over our
287
  # *secondary* index, avoiding the decoding overhead
288
  while ($cursor->c_get($atime, $value, $next) == 0) {
289
290
    # Go through from $start * 1 to $start * 512, mark how many tokens we would expire
291
    my $age = $newest_atime - $atime;
292
    for (my $i = 1; $i <= $max_expire_mult; $i <<= 1) {
293
      if ($age >= $start * $i) {
294
        $delta{$i}++;
295
      } else {
296
        # If the token age is less than the expire delta, it'll be
297
        # less for all upcoming checks too, so abort early.
298
        last;
299
      }
300
    }
301
  }
302
303
  $cursor->c_close and die "Couldn't close cursor: $BerkeleyDB::Error";
304
  undef $cursor;
305
306
  #dbg("BDB: calculate_expire_delta done");
307
  return %delta;
308
}
309
310
=head2 token_expiration
311
312
public instance (Integer, Integer,
313
                 Integer, Integer) token_expiration (\% $opts,
314
                                                     Integer $newdelta,
315
                                                     @ @vars)
316
317
Description:
318
This method performs the database specific expiration of tokens based on
319
the passed in C<$newdelta> and C<@vars>.
320
321
=cut
322
323
sub token_expiration {
324
  #dbg("BDB: Entering token_expiration");
325
  my($self, $opts, $newdelta, @vars) = @_;
326
327
  my($kept, $deleted, $hapaxes, $lowfreq) = (0, 0, 0, 0);
328
329
  # Reset stray too-new tokens
330
  {
331
    my $cursor = $self->{handles}->{atime}->db_cursor or die "Couldn't get cursor: $BerkeleyDB::Error";
332
333
    # Grab the token for a tight RWM loop
334
    my($atime, $flag) = ($vars[10], DB_SET_RANGE|$rmw);
335
    # Find the first token eq or gt the current newest
336
    while ($cursor->c_pget($atime, my $token, my $value, $flag) == 0) {
337
      my($ts, $th, $current) = _unpack_token($value);
338
      $self->{handles}->{tokens}->db_put($token, _pack_token($ts, $th, $atime)) and die "Couldn't put record: $BerkeleyDB::Error";
339
      $flag = $next|$rmw; # We need to adjust our flag to continue on from the first rec
340
    }
341
342
    $cursor->c_close and die "Couldn't close cursor: $BerkeleyDB::Error";
343
    undef $cursor;
344
  }
345
346
  # Figure out how old is too old...
347
  my $too_old = $vars[10] - $newdelta; # tooold = newest - delta
348
  #dbg("BDB: Too old is $too_old");
349
350
  #dbg("BDB: Getting db stats");
351
  my $count;
352
353
  # Estimate the number of keys to be deleted
354
  {
355
    my $stats = $self->{handles}->{atime}->db_stat(DB_FAST_STAT);
356
    #dbg("DBD: Stats: " . Dumper $stats);
357
    # Scan if we've never gotten stats before 
358
    $stats = $self->{handles}->{atime}->db_stat if($stats->{bt_ndata} == 0);
359
    #dbg("DBD: Stats: " . Dumper $stats);
360
    if ($self->{handles}->{atime}->db_key_range($too_old, my $less, my $equal, my $greater) == 0) {
361
      #dbg("DBD: less is $less, equal is $equal, greater is $greater");
362
      $count = $stats->{bt_ndata} - $stats->{bt_ndata} * $greater;
363
    }
364
  }
365
366
  #dbg("BDB: Considering deleting $vars[3], $count");
367
368
  # As long as too many tokens wouldn't be deleted
369
  if ($vars[3] - $count >= 100000) {
370
371
    #dbg("BDB: Preparing to iterate");
372
373
    my $cursor = $self->{handles}->{atime}->db_cursor or die "Couldn't get cursor: $BerkeleyDB::Error";
374
375
    my ($atime, $oldest, $token, $value);
376
377
    $atime = 0;
378
379
    while ($cursor->c_pget($atime, $token, $value, $next) == 0) {
380
      # We're traversing in order, so done
381
      $oldest = $atime, last if($atime >= $too_old);
382
      #dbg("BDB: Deleting record");
383
      $cursor->c_del;
384
      $deleted++;
385
      my($ts, $th, $atime) = _unpack_token($value);
386
      if ($ts + $th == 1) {
387
        $hapaxes++;
388
      } elsif ($ts < 8 && $th < 8) {
389
        $lowfreq++;
390
      }
391
    }
392
393
    #dbg("BDB: Done with cursor");
394
    $cursor->c_close and die "Couldn't close cursor: $BerkeleyDB::Error";
395
    undef $cursor;
396
397
    $kept = $self->_get (vars => "NTOKENS", $rmw) - $deleted;
398
    $self->{handles}->{vars}->db_put(NTOKENS => $kept) and die "Couldn't put record: $BerkeleyDB::Error";
399
    $self->{handles}->{vars}->db_put(LAST_EXPIRE => time) and die "Couldn't put record: $BerkeleyDB::Error";
400
    $self->{handles}->{vars}->db_put(OLDEST_TOKEN_AGE => $oldest) and die "Couldn't put record: $BerkeleyDB::Error";
401
    $self->{handles}->{vars}->db_put(LAST_EXPIRE_REDUCE =>  $deleted) and die "Couldn't put record: $BerkeleyDB::Error";
402
    $self->{handles}->{vars}->db_put(LAST_ATIME_DELTA => $newdelta) and die "Couldn't put record: $BerkeleyDB::Error";
403
404
    #$self->{handles}->{atime}->compact;
405
    #$self->{handles}->{tokens}->compact;
406
    #$self->{handles}->{vars}->compact;
407
408
  } else {
409
    #dbg("BDB: Update vars to regenerate histogram");
410
    # Make sure we regenerate our histogramn
411
    $kept = $self->_get(vars => "NTOKENS");
412
    $self->{handles}->{vars}->db_put(LAST_EXPIRE => time) and die "Couldn't put record: $BerkeleyDB::Error";
413
    $self->{handles}->{vars}->db_put(LAST_ATIME_DELTA => 0) and die "Couldn't put record: $BerkeleyDB::Error";
414
    $self->{handles}->{vars}->db_put(LAST_EXPIRE_REDUCE => 0) and die "Couldn't put record: $BerkeleyDB::Error";
415
  }
416
417
  #dbg("BDB: token_expiration done");
418
  return($kept, $deleted, $hapaxes, $lowfreq);
419
}
420
421
=head2 sync_due
422
423
public instance (Boolean) sync_due ()
424
425
Description:
426
This method determines if a database sync is currently required.
427
428
Unused for BDB implementation.
429
430
=cut
431
432
sub sync_due {
433
  return 0;
434
}
435
436
=head2 seen_get
437
438
public instance (String) seen_get (string $msgid)
439
440
Description:
441
This method retrieves the stored value, if any, for C<$msgid>.  The return value
442
is the stored string ('s' for spam and 'h' for ham) or undef if C<$msgid> is not
443
found.
444
445
=cut
446
447
sub seen_get {
448
  #dbg("BDB: Entering seen_get");
449
  my($self, $msgid) = @_;
450
451
  my $value = $self->_get(seen => $msgid);
452
453
  return $value;
454
}
455
456
=head2 seen_put
457
458
public (Boolean) seen_put (string $msgid, char $flag)
459
460
Description:
461
This method records C<$msgid> as the type given by C<$flag>.  C<$flag> is one of
462
two values 's' for spam and 'h' for ham.
463
464
=cut
465
466
sub seen_put {
467
  #dbg("BDB: Entering seen_put");
468
  my($self, $msgid, $flag) = @_;
469
470
  $self->{handles}->{seen}->db_put($msgid, $flag) and die "Couldn't put record: $BerkeleyDB::Error";
471
472
  return 1;
473
}
474
475
=head2 seen_delete
476
477
public instance (Boolean) seen_delete (string $msgid)
478
479
Description:
480
This method removes C<$msgid> from the database.
481
482
=cut
483
484
sub seen_delete {
485
  #dbg("BDB: Entering seen_delete");
486
  my($self, $msgid) = @_;
487
488
  my $result;
489
490
  my $status = $self->{handles}->{seen}->db_del($msgid);
491
492
  if ($status == 0) {
493
    $result = 1;
494
  } elsif ($status == DB_NOTFOUND) {
495
    $result = 0E0;
496
  } else {
497
    die "Couldn't delete record: $BerkeleyDB::Error";
498
  }
499
500
  return $result;
501
}
502
503
=head2 get_storage_variables
504
505
public instance (@) get_storage_variables ()
506
507
Description:
508
This method retrieves the various administrative variables used by
509
the Bayes process and database.
510
511
The values returned in the array are in the following order:
512
513
0: scan count base
514
515
1: number of spam
516
517
2: number of ham
518
519
3: number of tokens in db
520
521
4: last expire atime
522
523
5: oldest token in db atime
524
525
6: db version value
526
527
7: last journal sync
528
529
8: last atime delta
530
531
9: last expire reduction count
532
533
10: newest token in db atime
534
535
=cut
536
537
sub get_storage_variables {
538
  #dbg("BDB: get_storage_variables starting");
539
  my($self) = @_;
540
541
  my @values;
542
  for my $token (qw{LAST_JOURNAL_SYNC NSPAM NHAM NTOKENS LAST_EXPIRE OLDEST_TOKEN_AGE DB_VERSION LAST_JOURNAL_SYNC LAST_ATIME_DELTA LAST_EXPIRE_REDUCE NEWEST_TOKEN_AGE}) {
543
    my $value = $self->_get (vars => $token);
544
    $value = 0 unless($value and $value =~ /\d+/);
545
    push @values, $value;
546
  }
547
548
  #dbg("BDB: get_storage_variables done");
549
  return @values;
550
}
551
552
=head2 dump_tokens
553
554
public instance () dump_tokens (String $template, String $regex, Array @vars)
555
556
Description:
557
This method loops over all tokens, computing the probability for the token and then
558
printing it out according to the passed in token.
559
560
=cut
561
562
sub dump_tokens {
563
  #dbg("BDB: dump_tokens starting");
564
  my($self, $template, $regex, @vars) = @_;
565
566
  my $cursor = $self->{handles}->{tokens}->db_cursor or die "Couldn't get cursor: $BerkeleyDB::Error";
567
  my ($token, $value) = ("", "");
568
  while ($cursor->c_get($token, $value, $next) == 0) {
569
    next if(defined $regex && ($token !~ /$regex/o));
570
    my($ts, $th, $atime) = _unpack_token($value);
571
    my $prob = $self->{bayes}->compute_prob_for_token($token, $vars[1], $vars[2], $ts, $th) || 0.5;
572
    my $encoded = unpack("H*",$token);
573
    printf $template, $prob, $ts, $th, $atime, $encoded;
574
  }
575
576
  $cursor->c_close and die "Couldn't close cursor: $BerkeleyDB::Error";
577
  undef $cursor;
578
579
  #dbg("BDB: dump_tokens done");
580
  return 1;
581
}
582
583
=head2 set_last_expire
584
585
public instance (Boolean) set_last_expire (Integer $time)
586
587
Description:
588
This method sets the last expire time.
589
590
=cut
591
592
sub set_last_expire {
593
  #dbg("BDB: Entering set_last_expire");
594
  my($self, $time) = @_;
595
  $self->{handles}->{vars}->db_put(LAST_EXPIRE => $time) and die "Couldn't put record: $BerkeleyDB::Error";
596
  return 1;
597
}
598
599
=head2 get_running_expire_tok
600
601
public instance (String $time) get_running_expire_tok ()
602
603
Description:
604
This method determines if an expire is currently running and returns
605
the last time set.
606
607
There can be multiple times, so we just pull the greatest (most recent)
608
value.
609
610
=cut
611
612
sub get_running_expire_tok {
613
  #dbg("BDB: Entering get_running_expire_tok");
614
  my($self) = @_;
615
616
  my $value = $self->_get (vars => "RUNNING_EXPIRE") || "";
617
  my $result = $value if $value =~ /^\d+$/;
618
619
  #dbg("BDB: get_running_expire_tok exiting with $result");
620
  return $result;
621
}
622
623
=head2 set_running_expire_tok
624
625
public instance (String $time) set_running_expire_tok ()
626
627
Description:
628
This method sets the time that an expire starts running.
629
630
=cut
631
632
sub set_running_expire_tok {
633
  my($self) = @_;
634
635
  my $time = time;
636
  $self->{handles}->{vars}->db_put(RUNNING_EXPIRE => $time) and die "Couldn't put record: $BerkeleyDB::Error";
637
638
  return $time;
639
}
640
641
=head2 remove_running_expire_tok
642
643
public instance (Boolean) remove_running_expire_tok ()
644
645
Description:
646
This method removes the row in the database that indicates that
647
and expire is currently running.
648
649
=cut
650
651
sub remove_running_expire_tok {
652
  my($self) = @_;
653
654
  my $status = $self->{handles}->{vars}->db_del("RUNNING_EXPIRE");
655
656
  my $result;
657
658
  if ($status == 0) {
659
    $result = 1;
660
  } elsif ($status == DB_NOTFOUND) {
661
    $result = 0E0;
662
  } else {
663
    die "Couldn't delete record: $BerkeleyDB::Error";
664
  }
665
666
  return $result;
667
}
668
669
=head2 tok_get
670
671
public instance (Integer, Integer, Integer) tok_get (String $token)
672
673
Description:
674
This method retrieves a specificed token (C<$token>) from the database
675
and returns it's spam_count, ham_count and last access time.
676
677
=cut
678
679
sub tok_get {
680
  #dbg("BDB: Entering tok_get");
681
  my($self, $token) = @_;
682
  my $array = $self->tok_get_all ([$token]);
683
  return @{$array->[0]};
684
}
685
686
=head2 tok_get_all
687
688
public instance (\@) tok_get (@ $tokens)
689
690
Description:
691
This method retrieves the specified tokens (C<$tokens>) from storage and returns
692
an array ref of arrays spam count, ham acount and last access time.
693
694
=cut
695
696
sub tok_get_all {
697
  #dbg("BDB: Entering tok_get_all");
698
  my($self, @keys) = @_;
699
700
  my @values;
701
  for my $token (@keys) {
702
    if (my $value = $self->_get(seen => $token)) {
703
      push(@values, [$token, _unpack_token($value)]);
704
    }
705
  }
706
707
  # #dbg("BDB: tok_get_all returning with " . Dump \@values);
708
  return \@values;
709
}
710
711
=head2 tok_count_change
712
713
public instance (Boolean) tok_count_change (Integer $dspam,
714
					    Integer $dham,
715
					    String $token,
716
					    String $newatime)
717
718
Description:
719
This method takes a C<$spam_count> and C<$ham_count> and adds it to
720
C<$tok> along with updating C<$tok>s atime with C<$atime>.
721
722
=cut
723
724
sub tok_count_change {
725
  #dbg("BDB: Entering tok_count_change");
726
  my($self, $dspam, $dham, $token, $newatime) = @_;
727
  $self->multi_tok_count_change ($dspam, $dham, {$token => 1}, $newatime);
728
}
729
730
=head2 multi_tok_count_change
731
732
public instance (Boolean) multi_tok_count_change (Integer $dspam,
733
 					          Integer $dham,
734
				 	          \% $tokens,
735
					          String $newatime)
736
737
Description:
738
This method takes a C<$dspam> and C<$dham> and adds it to all of the
739
tokens in the C<$tokens> hash ref along with updating each tokens
740
atime with C<$atime>.
741
742
=cut
743
744
sub multi_tok_count_change {
745
  my($self, $dspam, $dham, $tokens, $newatime) = @_;
746
747
  # Make sure we have some values
748
  $dspam ||= 0;
749
  $dham ||= 0;
750
  $newatime ||= 0;
751
752
  # No changes, just return
753
  return 1 unless ($dspam or $dham);
754
755
  # Collect this for updates at the end
756
  my $newtokens = 0;
757
758
  for my $token (keys %{$tokens}) {
759
    my $status = $self->{handles}->{tokens}->db_get($token => my $value, $rmw);
760
761
    if ($status == 0) {
762
      my ($spam, $ham, $oldatime) = _unpack_token ($value);
763
      $spam += $dspam;
764
      $spam = 0 if ($spam < 0);
765
      $ham += $dham;
766
      $ham = 0 if ($ham < 0);
767
      my $newvalue = _pack_token($spam, $ham, $newatime);
768
      $self->{handles}->{tokens}->db_put($token => $newvalue) and die "Couldn't put record: $BerkeleyDB::Error";
769
    }
770
771
    elsif ($status == DB_NOTFOUND) {
772
      my $spam = $dspam;
773
      $spam = 0 if ($spam < 0);
774
      my $ham = $dham;
775
      $ham = 0 if ($ham < 0);
776
      my $newvalue = _pack_token($spam, $ham, $newatime);
777
      $self->{handles}->{tokens}->db_put($token => $newvalue) and die "Couldn't put record: $BerkeleyDB::Error";
778
      $newtokens++;
779
    }
780
781
    else {
782
      die "Couldn't get record: $BerkeleyDB::Error";
783
    }
784
  }
785
786
  if ($newtokens) {
787
    my $ntokens = $self->_get(vars => "NTOKENS", $rmw) || 0;
788
    $ntokens += $newtokens;
789
    $ntokens = 0 if ($ntokens < 0);
790
    $self->{handles}->{vars}->db_put(NTOKENS => $ntokens) and die "Couldn't put record: $BerkeleyDB::Error";
791
  }
792
793
  my $newmagic = $self->_get(vars => "NEWEST_TOKEN_AGE", $rmw) || 0;
794
  if ($newatime > $newmagic) {
795
    $self->{handles}->{vars}->db_put(NEWEST_TOKEN_AGE => $newatime) and die "Couldn't put record: $BerkeleyDB::Error";
796
  }
797
798
  my $oldmagic = $self->_get(vars => "OLDEST_TOKEN_AGE", $rmw) || time;
799
  if ($newatime and $newatime < $oldmagic) {
800
    $self->{handles}->{vars}->db_put(OLDEST_TOKEN_AGE => $newatime) and die "Couldn't put record: $BerkeleyDB::Error";
801
  }
802
803
  return 1;
804
}
805
806
=head2 nspam_nham_get
807
808
public instance ($spam_count, $ham_count) nspam_nham_get ()
809
810
Description:
811
This method retrieves the total number of spam and the total number of
812
ham learned.
813
814
=cut
815
816
sub nspam_nham_get {
817
  #dbg("BDB: Entering nspam_nham_get");
818
  my($self) = @_;
819
  my @vars = $self->get_storage_variables();
820
  ($vars[1], $vars[2]);
821
}
822
823
=head2 nspam_nham_change
824
825
public instance (Boolean) nspam_nham_change (Integer $num_spam,
826
                                             Integer $num_ham)
827
828
Description:
829
This method updates the number of spam and the number of ham in the database.
830
831
=cut
832
833
sub nspam_nham_change {
834
  my($self, $ds, $dh) = @_;
835
836
  my $nspam = $self->_get(vars => "NSPAM", $rmw) || 0;
837
  $nspam += ($ds || 0);
838
  $nspam = 0 if ($nspam < 0);
839
  $self->{handles}->{vars}->db_put(NSPAM => $nspam) and die "Couldn't put record: $BerkeleyDB::Error";
840
841
  my $nham = $self->_get(vars => "NHAM", $rmw) || 0;
842
  $nham += ($dh || 0);
843
  $nham = 0 if ($nham < 0);
844
  $self->{handles}->{vars}->db_put(NHAM => $nham) and die "Couldn't put record: $BerkeleyDB::Error";
845
846
  return 1;
847
}
848
849
=head2 tok_touch
850
851
public instance (Boolean) tok_touch (String $token,
852
                                     String $atime)
853
854
Description:
855
This method updates the given tokens (C<$token>) atime.
856
857
The assumption is that the token already exists in the database.
858
859
We will never update to an older atime
860
861
=cut
862
863
sub tok_touch {
864
  my($self, $token, $atime) = @_;
865
  return $self->tok_touch_all ([$token], $atime);
866
}
867
868
=head2 tok_touch_all
869
870
public instance (Boolean) tok_touch (\@ $tokens
871
                                     String $atime)
872
873
Description:
874
This method does a mass update of the given list of tokens C<$tokens>,
875
if the existing token atime is < C<$atime>.
876
877
The assumption is that the tokens already exist in the database.
878
879
We should never be touching more than N_SIGNIFICANT_TOKENS, so we can
880
make some assumptions about how to handle the data (ie no need to
881
batch like we do in tok_get_all)
882
883
=cut
884
885
sub tok_touch_all {
886
  my($self, $tokens, $newatime) = @_;
887
888
  for my $token (@{$tokens}) {
889
    my $status = $self->{handles}->{tokens}->db_get($token => my $value, $rmw);
890
    if ($status == 0) {
891
      my ($spam, $ham, $oldatime) = _unpack_token ($value);
892
      my $newvalue = _pack_token ($spam, $ham, $newatime);
893
      $self->{handles}->{tokens}->db_put($token => $newvalue) and die "Couldn't put record: $BerkeleyDB::Error";
894
    }
895
896
    elsif ($status == DB_NOTFOUND) {
897
      # Do nothing
898
    }
899
900
    else {
901
      die "Couldn't get record: $BerkeleyDB::Error";
902
    }
903
  }
904
905
  return 1;
906
}
907
908
=head2 cleanup
909
910
public instance (Boolean) cleanup ()
911
912
Description:
913
This method peroms any cleanup necessary before moving onto the next
914
operation.
915
916
=cut
917
918
sub cleanup {
919
  my ($self) = @_;
920
  #dbg("Running cleanup");
921
  return 1;
922
}
923
924
=head2 get_magic_re
925
926
public instance (String) get_magic_re ()
927
928
Description:
929
This method returns a regexp which indicates a magic token.
930
931
Unused in BDB implementation.
932
933
=cut
934
935
use constant get_magic_re => undef;
936
937
=head2 sync
938
939
public instance (Boolean) sync (\% $opts)
940
941
Description:
942
This method performs a sync of the database
943
944
=cut
945
946
sub sync { 
947
  my($self, $opts) = @_;
948
  #dbg("Running sync");
949
  return 1;
950
}
951
952
=head2 perform_upgrade
953
954
public instance (Boolean) perform_upgrade (\% $opts);
955
956
Description:
957
Performs an upgrade of the database from one version to another, not
958
currently used in this implementation.
959
960
=cut
961
962
sub perform_upgrade {
963
  #dbg("BDB: Entering perform_upgrade");
964
  return 1;
965
}
966
967
=head2 clear_database
968
969
public instance (Boolean) clear_database ()
970
971
Description:
972
This method deletes all records for a particular user.
973
974
Callers should be aware that any errors returned by this method
975
could causes the database to be inconsistent for the given user.
976
977
=cut
978
979
sub clear_database {
980
  #dbg("BDB: Entering clear_database");
981
  my($self) = @_;
982
983
  $self->untie_db();
984
  #dbg("BDB: removing db.");
985
  my $main = $self->{bayes}->{main};
986
  my $path = $main->sed_path($main->{conf}->{bayes_path});
987
  eval {rmpath($path)};
988
  return 1;
989
}
990
991
=head2 backup_database
992
993
public instance (Boolean) backup_database ()
994
995
Description:
996
This method will dump the users database in a machine readable format.
997
998
=cut
999
1000
sub backup_database {
1001
  #dbg("BDB: Entering backup_database");
1002
  my($self) = @_;
1003
  return 0 unless $self->tie_db_writable;
1004
  my @vars = $self->get_storage_variables;
1005
1006
  print "v\t$vars[6]\tdb_version # this must be the first line!!!\n";
1007
  print "v\t$vars[1]\tnum_spam\n";
1008
  print "v\t$vars[2]\tnum_nonspam\n";
1009
1010
  my $tokens = $self->{handles}->{tokens}->db_cursor or die "Couldn't get cursor: $BerkeleyDB::Error";
1011
1012
  my($token, $value) = ("", "");
1013
  while ($tokens->c_get($token, $value, $next) == 0) {
1014
    my($ts, $th, $atime) = _unpack_token($value);
1015
    my $encoded = unpack("H*", $token);
1016
    print "t\t$ts\t$th\t$atime\t$encoded\n";
1017
  }
1018
1019
  $tokens->c_close and die "Couldn't close cursor: $BerkeleyDB::Error";
1020
  undef $tokens;
1021
1022
  my $seen = $self->{handles}->{seen}->db_cursor or die "Couldn't get cursor: $BerkeleyDB::Error";
1023
1024
  $token = "";
1025
  while ($seen->c_get($token, $value, $next) == 0) {
1026
    print "s\t$token\t$value\n";
1027
  }
1028
1029
  $seen->c_close and die "Couldn't close cursor: $BerkeleyDB::Error";
1030
  undef $seen;
1031
1032
  $self->untie_db();
1033
1034
  return 1;
1035
}
1036
1037
=head2 restore_database
1038
1039
public instance (Boolean) restore_database (String $filename, Boolean $showdots)
1040
1041
Description:
1042
This method restores a database from the given filename, C<$filename>.
1043
1044
Callers should be aware that any errors returned by this method
1045
could causes the database to be inconsistent for the given user.
1046
1047
=cut
1048
1049
sub restore_database {
1050
  #dbg("BDB: Entering restore_database");
1051
  my ($self, $filename, $showdots) = @_;
1052
1053
  local *DUMPFILE;
1054
  if (!open(DUMPFILE, '<', $filename)) {
1055
    #dbg("BDB: unable to open backup file $filename: $!");
1056
    return 0;
1057
  }
1058
1059
  # This is the critical phase (moving sql around), so don't allow it
1060
  # to be interrupted.
1061
  local $SIG{'INT'} = 'IGNORE';
1062
  local $SIG{'HUP'} = 'IGNORE' if (!Mail::SpamAssassin::Util::am_running_on_windows());
1063
  local $SIG{'TERM'} = 'IGNORE';
1064
1065
  unless ($self->clear_database()) {
1066
    return 0;
1067
  }
1068
1069
  # we need to go ahead close the db connection so we can then open it up
1070
  # in a fresh state after clearing
1071
  $self->untie_db();
1072
1073
  unless ($self->tie_db_writable()) {
1074
    return 0;
1075
  }
1076
1077
  my $token_count = 0;
1078
  my $db_version;
1079
  my $num_spam;
1080
  my $num_ham;
1081
  my $error_p = 0;
1082
  my $line_count = 0;
1083
1084
  my $line = <DUMPFILE>;
1085
  defined $line  or die "Error reading dump file: $!";
1086
  $line_count++;
1087
  # We require the database version line to be the first in the file so we can
1088
  # figure out how to properly deal with the file.  If it is not the first
1089
  # line then fail
1090
  if ($line =~ m/^v\s+(\d+)\s+db_version/) {
1091
    $db_version = $1;
1092
  } else {
1093
    #dbg("BDB: database version must be the first line in the backup file, correct and re-run");
1094
    return 0;
1095
  }
1096
1097
  unless ($db_version == 2 || $db_version == 3) {
1098
    warn("BDB: database version $db_version is unsupported, must be version 2 or 3");
1099
    return 0;
1100
  }
1101
1102
  my $token_error_count = 0;
1103
  my $seen_error_count = 0;
1104
1105
  for ($!=0; defined($line=<DUMPFILE>); $!=0) {
1106
    chomp($line);
1107
    $line_count++;
1108
1109
    if ($line_count % 1000 == 0) {
1110
      print STDERR "." if ($showdots);
1111
    }
1112
1113
    if ($line =~ /^v\s+/) {     # variable line
1114
      my @parsed_line = split(/\s+/, $line, 3);
1115
      my $value = $parsed_line[1] + 0;
1116
      if ($parsed_line[2] eq 'num_spam') {
1117
	$num_spam = $value;
1118
      } elsif ($parsed_line[2] eq 'num_nonspam') {
1119
	$num_ham = $value;
1120
      } else {
1121
	#dbg("BDB: restore_database: skipping unknown line: $line");
1122
      }
1123
    } elsif ($line =~ /^t\s+/) { # token line
1124
      my @parsed_line = split(/\s+/, $line, 5);
1125
      my $spam_count = $parsed_line[1] + 0;
1126
      my $ham_count = $parsed_line[2] + 0;
1127
      my $atime = $parsed_line[3] + 0;
1128
      my $token = $parsed_line[4];
1129
1130
      my $token_warn_p = 0;
1131
      my @warnings;
1132
1133
      if ($spam_count < 0) {
1134
	$spam_count = 0;
1135
	push(@warnings, 'spam count < 0, resetting');
1136
	$token_warn_p = 1;
1137
      }
1138
      if ($ham_count < 0) {
1139
	$ham_count = 0;
1140
	push(@warnings, 'ham count < 0, resetting');
1141
	$token_warn_p = 1;
1142
      }
1143
1144
      if ($spam_count == 0 && $ham_count == 0) {
1145
	#dbg("BDB: token has zero spam and ham count, skipping");
1146
	next;
1147
      }
1148
1149
      if ($atime > time()) {
1150
	$atime = time();
1151
	push(@warnings, 'atime > current time, resetting');
1152
	$token_warn_p = 1;
1153
      }
1154
1155
      if ($token_warn_p) {
1156
	#dbg("BDB: token ($token) has the following warnings:\n".join("\n",@warnings));
1157
      }
1158
1159
      if ($db_version < 3) {
1160
	# versions < 3 use plain text tokens, so we need to convert to hash
1161
	$token = substr(sha1($token), -5);
1162
      } else {
1163
	# turn unpacked binary token back into binary value
1164
	$token = pack("H*",$token);
1165
      }
1166
1167
      unless ($self->_put_token($token, $spam_count, $ham_count, $atime)) {
1168
	#dbg("BDB: error inserting token for line: $line");
1169
	$token_error_count++;
1170
      }
1171
      $token_count++;
1172
    } elsif ($line =~ /^s\s+/) { # seen line
1173
      my @parsed_line = split(/\s+/, $line, 3);
1174
      my $flag = $parsed_line[1];
1175
      my $msgid = $parsed_line[2];
1176
1177
      unless ($flag eq 'h' || $flag eq 's') {
1178
	#dbg("BDB: unknown seen flag ($flag) for line: $line, skipping");
1179
	next;
1180
      }
1181
1182
      unless ($msgid) {
1183
	#dbg("BDB: blank msgid for line: $line, skipping");
1184
	next;
1185
      }
1186
1187
      unless ($self->seen_put($msgid, $flag)) {
1188
	#dbg("BDB: error inserting msgid in seen table for line: $line");
1189
	$seen_error_count++;
1190
      }
1191
    } else {
1192
      #dbg("BDB: skipping unknown line: $line");
1193
      next;
1194
    }
1195
1196
    if ($token_error_count >= 20) {
1197
      warn "BDB: encountered too many errors (20) while parsing token line, reverting to empty database and exiting\n";
1198
      $self->clear_database();
1199
      return 0;
1200
    }
1201
1202
    if ($seen_error_count >= 20) {
1203
      warn "BDB: encountered too many errors (20) while parsing seen lines, reverting to empty database and exiting\n";
1204
      $self->clear_database();
1205
      return 0;
1206
    }
1207
  }
1208
  defined $line || $!==0  or
1209
    $!==EBADF ? dbg("BDB: error reading dump file: $!")
1210
      : die "error reading dump file: $!";
1211
  close(DUMPFILE) or die "Can't close dump file: $!";
1212
1213
  print STDERR "\n" if ($showdots);
1214
1215
  unless (defined($num_spam)) {
1216
    #dbg("BDB: unable to find num spam, please check file");
1217
    $error_p = 1;
1218
  }
1219
1220
  unless (defined($num_ham)) {
1221
    #dbg("BDB: unable to find num ham, please check file");
1222
    $error_p = 1;
1223
  }
1224
1225
  if ($error_p) {
1226
    #dbg("BDB: error(s) while attempting to load $filename, clearing database, correct and re-run");
1227
    $self->clear_database();
1228
    return 0;
1229
  }
1230
1231
  if ($num_spam || $num_ham) {
1232
    unless ($self->nspam_nham_change($num_spam, $num_ham)) {
1233
      #dbg("BDB: error updating num spam and num ham, clearing database");
1234
      $self->clear_database();
1235
      return 0;
1236
    }
1237
  }
1238
1239
  #dbg("BDB: parsed $line_count lines");
1240
  #dbg("BDB: created database with $token_count tokens based on $num_spam spam messages and $num_ham ham messages");
1241
1242
  $self->untie_db();
1243
1244
  return 1;
1245
}
1246
1247
=head2 db_readable
1248
1249
public instance (Boolean) db_readable()
1250
1251
Description:
1252
This method returns a boolean value indicating if the database is in a
1253
readable state.
1254
1255
=cut
1256
1257
sub db_readable {
1258
  #dbg("BDB: Entering db_readable");
1259
  my($self) = @_;
1260
  return $self->{already_tied};
1261
}
1262
1263
=head2 db_writable
1264
1265
public instance (Boolean) db_writeable()
1266
1267
Description:
1268
This method returns a boolean value indicating if the database is in a
1269
writable state.
1270
1271
=cut
1272
1273
sub db_writable {
1274
  #dbg("BDB: Entering db_writeable");
1275
  my($self) = @_;
1276
  return($self->{already_tied} and $self->{is_locked});
1277
}
1278
1279
=head2 _extract_atime
1280
1281
private instance () _extract_atime (String $token,
1282
                                    String $value,
1283
                                    String $index)
1284
1285
Description:
1286
This method ensures that the database connetion is properly setup and
1287
working. If appropriate it will initialize a users bayes variables so
1288
that they can begin using the database immediately.
1289
1290
=cut
1291
1292
sub _extract_atime {
1293
  #dbg("BDB: Entering _extract_atime");
1294
  my ($token, $value) = @_;
1295
  my($ts, $th, $atime) = _unpack_token($value);
1296
  #dbg("BDB: _extract_atime found $atime for $token");
1297
  $_[2] = $atime;
1298
  #dbg("BDB: Leaving db_writeable");
1299
  return 0;
1300
}
1301
1302
=head2 _put_token
1303
1304
FIXME: This is rarely a good interface, because of the churn that will
1305
often happen in the "magic" tokens.  Open-code this stuff in the
1306
presence of loops.
1307
1308
=cut
1309
1310
sub _put_token {
1311
  #dbg("BDB: Entering _put_token");
1312
  my($self, $token, $ts, $th, $atime) = @_;
1313
1314
  $ts ||= 0;
1315
  $th ||= 0;
1316
1317
  #dbg("BDB: $token has spam $ts, ham $th, atime $atime");
1318
1319
  my $value = $self->_get(tokens => $token, $rmw);
1320
1321
  my $exists_already = defined $value ? 1 : 0;
1322
1323
  #dbg("BDB: $token exists: $exists_already");
1324
  if ($ts == 0 && $th == 0) {
1325
    return unless($exists_already); # If the token doesn't exist, just return
1326
    my $ntokens = $self->_get(vars => "NTOKENS", $rmw);
1327
    $self->{handles}->{vars}->db_put(NTOKENS => --$ntokens) and die "Couldn't put record: $BerkeleyDB::Error";
1328
    #dbg("BDB: ntokens is $ntokens");
1329
1330
    my $status = $self->{handles}->{tokens}->db_del($token);
1331
1332
    die "Couldn't delete record: $BerkeleyDB::Error" unless ($status == 0 or $status == DB_NOTFOUND);
1333
    #dbg("BDB: $token deleted");
1334
  } else {
1335
    unless($exists_already) { # If the token doesn't exist, raise the token count
1336
      my $ntokens = $self->_get(vars => "NTOKENS", $rmw);
1337
      $self->{handles}->{vars}->db_put(NTOKENS => ++$ntokens) and die "Couldn't put record: $BerkeleyDB::Error";
1338
      #dbg("BDB: ntokens is $ntokens");
1339
    }
1340
1341
    my $newmagic = $self->_get(vars => "NEWEST_TOKEN_AGE", $rmw) || 0;
1342
    #dbg("BDB: NEWEST_TOKEN_AGE is $newmagic");
1343
1344
    if ($atime > $newmagic) {
1345
      #dbg("BDB: Updating NEWEST_TOKEN_AGE");
1346
      $self->{handles}->{vars}->db_put(NEWEST_TOKEN_AGE => $atime) and die "Couldn't put record: $BerkeleyDB::Error";
1347
    }
1348
1349
    my $oldmagic = $self->_get(vars => "OLDEST_TOKEN_AGE", $rmw) || time;
1350
    #dbg("BDB: OLDEST_TOKEN_AGE is $oldmagic");
1351
    if ($atime and $atime < $oldmagic) {
1352
      #dbg("BDB: Updating OLDEST_TOKEN_AGE to $atime");
1353
      $self->{handles}->{vars}->db_put(OLDEST_TOKEN_AGE => $atime) and die "Couldn't put record: $BerkeleyDB::Error";
1354
    }
1355
1356
    my $value = _pack_token($ts, $th, $atime);
1357
1358
    #dbg("BDB: Setting $token to $value");
1359
    #dbg("BDB: Handle is $self->{handles}->{tokens}");
1360
1361
    $self->{handles}->{tokens}->db_put($token, $value) and die "Couldn't put record: $BerkeleyDB::Error";
1362
  }
1363
1364
  #dbg("BDB: Leaving _put_token");
1365
  return 1;
1366
}
1367
1368
# token marshalling format for tokens.
1369
1370
# Since we may have many entries with few hits, especially thousands of hapaxes
1371
# (1-occurrence entries), use a flexible entry format, instead of simply "2
1372
# packed ints", to keep the memory and disk space usage down.  In my
1373
# 18k-message test corpus, only 8.9% have >= 8 hits in either counter, so we
1374
# can use a 1-byte representation for the other 91% of low-hitting entries
1375
# and save masses of space.
1376
1377
# This looks like: XXSSSHHH (XX = format bits, SSS = 3 spam-count bits, HHH = 3
1378
# ham-count bits).  If XX in the first byte is 11, it's packed as this 1-byte
1379
# representation; otherwise, if XX in the first byte is 00, it's packed as
1380
# "CLL", ie. 1 byte and 2 32-bit "longs" in perl pack format.
1381
1382
# Savings: roughly halves size of toks db, at the cost of a ~10% slowdown.
1383
1384
use constant FORMAT_FLAG	=> 0xc0; # 11000000
1385
use constant ONE_BYTE_FORMAT	=> 0xc0; # 11000000
1386
use constant TWO_LONGS_FORMAT	=> 0x00; # 00000000
1387
1388
use constant ONE_BYTE_SSS_BITS	=> 0x38; # 00111000
1389
use constant ONE_BYTE_HHH_BITS	=> 0x07; # 00000111
1390
1391
sub _unpack_token {
1392
  my $value = shift || 0;
1393
1394
  my($packed, $ts, $th, $atime) = unpack("CVVV", $value);
1395
1396
  if (($packed & FORMAT_FLAG) == ONE_BYTE_FORMAT) {
1397
    return (($packed & ONE_BYTE_SSS_BITS) >> 3,
1398
            $packed & ONE_BYTE_HHH_BITS,
1399
            $ts || 0); # The one-byte-format uses that first 32-bit long as atime
1400
  } elsif (($packed & FORMAT_FLAG) == TWO_LONGS_FORMAT) {
1401
    return ($ts || 0, $th || 0, $atime || 0);
1402
  } else {
1403
    warn "BDB: unknown packing format for bayes db, please re-learn: $packed";
1404
    return (0, 0, 0);
1405
  }
1406
}
1407
1408
sub _pack_token {
1409
  my($ts, $th, $atime) = @_;
1410
  $ts ||= 0; $th ||= 0; $atime ||= 0;
1411
  if ($ts < 8 && $th < 8) {
1412
    return pack("CV", (ONE_BYTE_FORMAT | ($ts << 3) | $th) & 255, $atime);
1413
  } else {
1414
    return pack("CVVV", TWO_LONGS_FORMAT, $ts, $th, $atime);
1415
  }
1416
}
1417
1418
sub _get {
1419
  my ($self, $table, $key, $flags) = @_;
1420
1421
  $flags |= 0;
1422
1423
  my $value = "";
1424
1425
  my $status = $self->{handles}->{$table}->db_get($key => $value, $flags);
1426
1427
  if ($status == 0) {
1428
    return $value;
1429
  } elsif ($status == DB_NOTFOUND) {
1430
    return undef;
1431
  } else {
1432
    die "Couldn't get record: $BerkeleyDB::Error";
1433
  }
1434
}
1435
1436
sub sa_die { Mail::SpamAssassin::sa_die(@_); }
1437
1438
1;
1439
1440
# Local Variables:
1441
# mode: CPerl
1442
# cperl-indent-level: 2
1443
# End:
(-)masses/bayes-testing/benchmark/README (-24 / +15 lines)
Lines 1-24 Link Here
1
Here is a quick benchmark that I worked up to help test changes to the
1
This is a quick benchmark to help test changes to the bayes storage
2
bayes storage system.  It also happens to work as a benchmark for
2
system.  It also happens to work as a benchmark for other changes.  It
3
other changes.  It requires that you install the code you wish to
3
requires that you install the code you wish to benchmark, but upgrades
4
benchmark, but of course you're welcome to upgrade the scripts to make
4
to make the scripts more in-tree friendly would be accepted.
5
them more in-tree friendly.
5
6
You will also need Proc::Background.
6
7
7
Quick Start:
8
Quick Start:
8
9
9
You need to install some perl modules:
10
Create 8 buckets for your mail (mbox files), 4 each of ham and spam.
10
11
They should be in rough date order (oldest in bucket 1).  Name them
11
Mail::Box::Manager
12
hambucketX.mbox and spambucketX.mbox where X is the bucket number.
12
Proc::Background
13
14
Feel free to rewrite the runmulti.pl and runmbox.pl code to get
15
rid of those modules.  Maybe one day when Mail::SpamAssassin::Client
16
gets finished this will be possible.
17
18
Create 6 buckets for your mail (mbox files).  3 should be ham, 3
19
should be spam.  They should be in rough date order (oldest in bucket
20
1).  Name them hambucketX.mbox and spambucketX.mbox where X is the
21
bucket number.
22
13
23
I suggest at least 1000 messages per bucket, for sure it should not be
14
I suggest at least 1000 messages per bucket, for sure it should not be
24
less than 200, and maybe even 300 depending on how much autolearning
15
less than 200, and maybe even 300 depending on how much autolearning
Lines 49-58 Link Here
49
  This is the learning phase, here we run sa-learn on hambucket1.mbox
40
  This is the learning phase, here we run sa-learn on hambucket1.mbox
50
  and spambucket1.mbox, getting the timings for each.
41
  and spambucket1.mbox, getting the timings for each.
51
42
52
Phase 2:
43
Phase 2: This is the spamd scanning phase.  We startup a spamd and
53
  This is the spamd scanning phase.  We startup a spamd and then
44
  then startup a forking script that throws all messages in
54
  startup a forking script that throws all messages in hambucket2.mbox
45
  hambucket2.mbox, hambucket3.mbox, spambucket2.mbox and
55
  and spambucket2.mbox at the daemon using spamc.
46
  spambucket4.mbod at the daemon using spamc.
56
47
57
  After this is done it does an sa-learn --sync and an
48
  After this is done it does an sa-learn --sync and an
58
  sa-learn --force-expire.
49
  sa-learn --force-expire.
Lines 64-71 Link Here
64
55
65
Phase 4:
56
Phase 4:
66
  This is the spamassassin scan phase.  Here we scan the
57
  This is the spamassassin scan phase.  Here we scan the
67
  hambucket3.mbox and then the spambucket3.mbox using the spamassassin
58
  hambucket4.mbox and then the spambucket4.mbox using the spamassassin
68
  script.
59
  script.
69
60
70
I suggest running each benchmark 3 times to make sure your test is not
61
I suggest running each benchmark 3 times to make sure your test is not
71
influenced by other system activities too much.
62
influenced by other system activities too much.
(-)masses/bayes-testing/benchmark/helper/bdb/cleardb (+5 lines)
Line 0 Link Here
1
#!/bin/bash
2
3
DBDIR=$1/dbdir
4
echo "[Removing BDB bayes Database directory - $DBDIR]"
5
rm -rf $DBDIR
(-)masses/bayes-testing/benchmark/helper/bdb/dbsize (+5 lines)
Line 0 Link Here
1
#!/bin/bash
2
3
DBDIR=$1/dbdir
4
echo "[Determining BDB bayes database size - $DBDIR]"
5
du -sk $DBDIR
(-)masses/bayes-testing/benchmark/helper/bdb/setup (+8 lines)
Line 0 Link Here
1
#!/bin/bash
2
3
BAYESPATH=$1/dbdir/bayes
4
5
mkdir -p $(dirname $BAYESPATH)
6
7
echo "[Replacing BAYESPATH in $1/site/local.cf file]"
8
sed -i -e "s:@@BAYESPATH@@:$BAYESPATH:" $1/site/local.cf
(-)masses/bayes-testing/benchmark/tests/bdb/site/init.pre (+6 lines)
Line 0 Link Here
1
# AutoLearnThreshold - threshold-based discriminator for Bayes auto-learning
2
#
3
loadplugin Mail::SpamAssassin::Plugin::Check
4
loadplugin Mail::SpamAssassin::Plugin::Bayes
5
loadplugin Mail::SpamAssassin::Plugin::AWL
6
loadplugin Mail::SpamAssassin::Plugin::AutoLearnThreshold
(-)masses/bayes-testing/benchmark/tests/bdb/site/local.cf (+8 lines)
Line 0 Link Here
1
bayes_store_module	Mail::SpamAssassin::BayesStore::BDB
2
3
use_auto_whitelist	0
4
5
bayes_auto_expire	0
6
7
bayes_path @@BAYESPATH@@
8
(-)masses/bayes-testing/benchmark/tests/bdb/user_prefs (+1 lines)
Line 0 Link Here
1
(-)t/bayesbdb.t (+252 lines)
Line 0 Link Here
1
#!/usr/bin/perl
2
3
use Data::Dumper;
4
use lib '.'; use lib 't';
5
use SATest; sa_t_init("bayes");
6
use Test;
7
8
use constant TEST_ENABLED => conf_bool('run_long_tests') &&
9
                            eval { require BerkeleyDB; $BerkeleyDB::db_version >= 4.6; };
10
11
BEGIN { 
12
  if (-e 't/test_dir') {
13
    chdir 't';
14
  }
15
16
  if (-e 'test_dir') {
17
    unshift(@INC, '../blib/lib');
18
  }
19
20
  plan tests => (TEST_ENABLED ? 42 : 0);
21
};
22
23
exit unless TEST_ENABLED;
24
25
tstlocalrules ("
26
        bayes_store_module Mail::SpamAssassin::BayesStore::BDB
27
");
28
29
use Mail::SpamAssassin;
30
31
my $sa = create_saobj();
32
33
$sa->init();
34
35
sub getimpl {
36
  return $sa->call_plugins("learner_get_implementation");
37
}
38
ok($sa);
39
40
ok ($sa->{bayes_scanner} && getimpl);
41
42
ok(!$sa->{bayes_scanner}->is_scan_available());
43
44
open(MAIL,"< data/spam/001");
45
46
my $raw_message = do {
47
  local $/;
48
  <MAIL>;
49
};
50
51
close(MAIL);
52
ok($raw_message);
53
54
my $mail = $sa->parse( $raw_message );
55
56
ok($mail);
57
58
my $body = getimpl->get_body_from_msg($mail);
59
60
ok($body);
61
62
my $toks = getimpl->tokenize($mail, $body);
63
64
ok(scalar(keys %{$toks}) > 0);
65
66
my($msgid,$msgid_hdr) = getimpl->get_msgid($mail);
67
68
# $msgid is the generated hash messageid
69
# $msgid_hdr is the Message-Id header
70
ok($msgid eq 'ce33e4a8bc5798c65428d6018380bae346c7c126@sa_generated')
71
    or warn "got: [$msgid]";
72
ok($msgid_hdr eq '9PS291LhupY');
73
74
ok(getimpl->{store}->tie_db_writable());
75
76
ok(!getimpl->{store}->seen_get($msgid));
77
78
getimpl->{store}->untie_db();
79
80
ok($sa->{bayes_scanner}->learn(1, $mail));
81
82
ok(!$sa->{bayes_scanner}->learn(1, $mail));
83
84
ok(getimpl->{store}->tie_db_writable());
85
86
ok(getimpl->{store}->seen_get($msgid) eq 's');
87
88
getimpl->{store}->untie_db();
89
90
ok(getimpl->{store}->tie_db_writable());
91
92
my $tokerror = 0;
93
foreach my $tok (keys %{$toks}) {
94
  my ($spam, $ham, $atime) = getimpl->{store}->tok_get($tok);
95
  if ($spam == 0 || $ham > 0) {
96
    $tokerror = 1;
97
  }
98
}
99
ok(!$tokerror);
100
101
my $tokens = getimpl->{store}->tok_get_all(keys %{$toks});
102
103
ok($tokens);
104
105
$tokerror = 0;
106
foreach my $tok (@{$tokens}) {
107
  my ($token, $tok_spam, $tok_ham, $atime) = @{$tok};
108
  if ($tok_spam == 0 || $tok_ham > 0) {
109
    $tokerror = 1;
110
  }
111
}
112
ok(!$tokerror);
113
114
getimpl->{store}->untie_db();
115
116
ok($sa->{bayes_scanner}->learn(0, $mail));
117
118
ok(getimpl->{store}->tie_db_writable());
119
120
ok(getimpl->{store}->seen_get($msgid) eq 'h');
121
122
getimpl->{store}->untie_db();
123
124
ok(getimpl->{store}->tie_db_writable());
125
126
$tokerror = 0;
127
foreach my $tok (keys %{$toks}) {
128
  my ($spam, $ham, $atime) = getimpl->{store}->tok_get($tok);
129
  if ($spam  > 0 || $ham == 0) {
130
    $tokerror = 1;
131
  }
132
}
133
ok(!$tokerror);
134
135
getimpl->{store}->untie_db();
136
137
ok($sa->{bayes_scanner}->forget($mail));
138
139
ok(getimpl->{store}->tie_db_writable());
140
141
ok(!getimpl->{store}->seen_get($msgid));
142
143
getimpl->{store}->untie_db();
144
145
undef $sa;
146
147
sa_t_init('bayes'); # this wipes out what is there and begins anew
148
149
# make sure we learn to a journal
150
tstlocalrules ("
151
bayes_min_spam_num 10
152
bayes_min_ham_num 10
153
");
154
155
# we get to bastardize the existing pattern matching code here.  It lets us provide
156
# our own checking callback and keep using the existing ok_all_patterns call
157
%patterns = ( 1 => 'Acted on message' );
158
159
ok(salearnrun("--spam data/spam", \&check_examined));
160
ok_all_patterns();
161
162
ok(salearnrun("--ham data/nice", \&check_examined));
163
ok_all_patterns();
164
165
ok(salearnrun("--ham data/whitelists", \&check_examined));
166
ok_all_patterns();
167
168
%patterns = ( 'non-token data: bayes db version' => 'db version' );
169
ok(salearnrun("--dump magic", \&patterns_run_cb));
170
ok_all_patterns();
171
172
use constant SCAN_USING_PERL_CODE_TEST => 1;
173
# jm: off! not working for some reason.   Mind you, this is
174
# not a supported way to call these APIs!  so no biggie
175
176
if (SCAN_USING_PERL_CODE_TEST) {
177
$sa = create_saobj();
178
179
$sa->init();
180
181
open(MAIL,"< ../sample-nonspam.txt");
182
183
$raw_message = do {
184
  local $/;
185
  <MAIL>;
186
};
187
188
close(MAIL);
189
190
$mail = $sa->parse( $raw_message );
191
192
$body = getimpl->get_body_from_msg($mail);
193
194
my $msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);
195
196
ok($msgstatus);
197
198
my $score = getimpl->scan($msgstatus, $mail, $body);
199
200
# Pretty much we can't count on the data returned with such little training
201
# so just make sure that the score wasn't equal to .5 which is the default
202
# return value.
203
print "\treturned score: $score\n";
204
ok($score =~ /\d/ && $score <= 1.0 && $score != .5);
205
206
open(MAIL,"< ../sample-spam.txt");
207
208
$raw_message = do {
209
  local $/;
210
  <MAIL>;
211
};
212
213
close(MAIL);
214
215
$mail = $sa->parse( $raw_message );
216
217
$body = getimpl->get_body_from_msg($mail);
218
219
$msgstatus = Mail::SpamAssassin::PerMsgStatus->new($sa, $mail);
220
221
$score = getimpl->scan($msgstatus, $mail, $body);
222
223
# Pretty much we can't count on the data returned with such little training
224
# so just make sure that the score wasn't equal to .5 which is the default
225
# return value.
226
print "\treturned score: $score\n";
227
ok($score =~ /\d/ && $score <= 1.0 && $score != .5);
228
229
}
230
231
ok(getimpl->{store}->clear_database());
232
233
ok(!-e 'log/user_state/bayes/vars.db');
234
ok(!-e 'log/user_state/bayes/seen.db');
235
ok(!-e 'log/user_state/bayes/toks.db');
236
237
sub check_examined {
238
  local ($_);
239
  my $string = shift;
240
241
  if (defined $string) {
242
    $_ = $string;
243
  } else {
244
    $_ = join ('', <IN>);
245
  }
246
247
  if ($_ =~ /(?:Forgot|Learned) tokens from \d+ message\(s\) \(\d+ message\(s\) examined\)/) {
248
    $found{'Acted on message'}++;
249
  }
250
}
251
252

Return to bug 6046