#!/bin/sh
#
# Compare Bayes.pm variants across several bayes_expiry_max_db_size values.
#
# For every combination of expiry size, Bayes.pm variant and mail-age
# bucket, this script:
#   1. installs the chosen Bayes.<variant>.pm as Bayes.pm,
#   2. wipes and re-trains the bayes databases with sa-learn,
#   3. runs mass-check against the mail in the chosen age bucket,
#   4. writes two filtered hit-frequencies reports into the current
#      directory: freq.percent.* and freq.absolute.*.
#
# WARNING: this overwrites $REALHOME/user_prefs, deletes $REALHOME/bayes*
# and $MASSESHOME/bayes*, and replaces $SALIB/Bayes.pm.

TESTROOT=$HOME/sa                       # Root of test hierarchy
SA=$TESTROOT/Mail-SpamAssassin-2.61     # SA tarball, extracted
SALIB=$SA/lib/Mail/SpamAssassin         # Where Bayes.pm lives
MASSES=$SA/masses                       # Where masses scripts live
REALHOME=$HOME/.spamassassin            # My real home directory
                                        # (I tested on my workstation which
                                        # doesn't actually process any mail,
                                        # so the script clobbers stuff in
                                        # there)
MASSESHOME=$MASSES/spamassassin         # masses "home" directory, where
                                        # mass-check's bayes_* lives

# Run hit-frequencies with the given options and keep only the OVERALL,
# "all messages" and BAYES lines, sorted on field 7. Output goes to stdout.
# Arguments: options passed straight through to hit-frequencies.
bayes_hit_summary() {
  # grep -E instead of "\(a\|b\)": \| in a basic regex is a GNU extension.
  "$MASSES/hit-frequencies" "$@" \
    | grep -E '(OVERALL|all messages|BAYES)' \
    | sort -k 7
}

# Try three bayes expiry sizes
for bayessize in 150000 250000 500000; do

  # set bayes max tokens.
  printf '%s %s\n' bayes_expiry_max_db_size "$bayessize" \
    > "$REALHOME/user_prefs"

  # Three different Bayes.pm files
  #   dist    is the Bayes.pm from the original SA tarball.
  #   custom  transforms all body tokens by sorting the letters in the
  #           words and learning the resulting token, then removing any
  #           duplicate (consecutive) letters and learning the resulting
  #           token.
  #   custom2 transforms all body tokens by sorting the letters in the
  #           words, then removing any duplicate (consecutive) letters and
  #           learning the resulting token. (custom2 doesn't learn the
  #           in-between token which contains duplicate letters.)
  for customtype in dist custom custom2; do

    # Learn all mail older than 3 months ("old").
    # Then test against all mail 3 months old or newer ("new").
    # Second pass, test against all mail 30 days old or newer ("verynew").
    for mailage in new verynew; do

      # Set Bayes implementation (with or without a customization).
      # Abort if this fails: otherwise the previous variant would stay
      # installed and the reports below would be labelled with the wrong
      # customtype.
      cp "$SALIB/Bayes.$customtype.pm" "$SALIB/Bayes.pm" || {
        printf 'cannot install Bayes.%s.pm as Bayes.pm\n' "$customtype" >&2
        exit 1
      }

      # pre-train bayes db with old spam and ham
      rm -f -- "$REALHOME"/bayes*
      rm -f -- "$MASSESHOME"/bayes*
      sa-learn --ham "$TESTROOT/ham/old"
      sa-learn --spam "$TESTROOT/spam/old"

      # Hack to learn mail with age between "old" and "verynew"
      if [ "verynew" = "$mailage" ]; then
        sa-learn --ham "$TESTROOT/ham/new_not_verynew"
        sa-learn --spam "$TESTROOT/spam/new_not_verynew"
      fi

      # try to trim the bayes DB down to $bayessize
      # (doesn't seem to do anything... whatever.)
      sa-learn --force-expire

      # Put the bayes databases into MASSESHOME/ for mass-check to use
      cp -R "$REALHOME"/bayes* "$MASSESHOME/"

      # mass-check the mail newer than "$mailage"
      "$MASSES/mass-check" \
        "spam:dir:$TESTROOT/spam/$mailage" \
        "ham:dir:$TESTROOT/ham/$mailage"

      # Do one hitfreq with percents and one with absolute values.
      bayes_hit_summary -x -p \
        > "freq.percent.${bayessize}.${mailage}.${customtype}.txt"
      bayes_hit_summary -x \
        > "freq.absolute.${bayessize}.${mailage}.${customtype}.txt"

    done
  done
done