#!/usr/bin/perl # # Purpose : Find meta rule suggestions based on mass-check logs # Author : Peter Fritz unlikejam.dreamhost.com> # Date : February 2005 # use strict; my $SPAM_THRESHOLD = 5; # Upper threshold for consideration my $MIN_MSG_RULE_HITS = 3; # Msgs have at least N rule hits to be considered my $MIN_RULE_COUNT = 50; # Rules hit at least N messages to be considered my $MAX_RULES = 300; my $MAX_META_RULES = 200; # Maximum number of meta rules to generate my $MIN_OVERLAP = $MIN_MSG_RULE_HITS; # Have at least N rule hits in common my $IGNORE_RULES = qr/^(?:AWL|BAYES|[^A-Z]|T_META_)/; # Rules to ignore my $RULE_PREFIX = "T_META"; sub usage { die < as output spam rules ('90_meta_spam.cf') --hamcf=file use as output ham rules ('90_meta_ham.cf') --spamout=log rewrite to as spam log (default: 'spam.log.meta') --hamout=log rewrite to as ham log (default: 'ham.log.meta') Input options: --spamlog=log use as spam log (default: 'spam.log') --hamlog=log use as ham log (default: 'ham.log') ENDOFUSAGE } ########## use Getopt::Long; use vars qw($opt_spamcf $opt_hamcf $opt_spamlog $opt_hamlog $opt_spamout $opt_hamout $opt_prefix $opt_o $opt_q $opt_s $opt_h $opt_c $opt_m $opt_g $opt_rewrite); $opt_spamcf = "90_meta_spam.cf"; $opt_hamcf = "90_meta_ham.cf"; $opt_hamlog = "ham.log"; $opt_spamlog = "spam.log"; $opt_spamout = "spam.log.meta"; $opt_hamout = "ham.log.meta"; GetOptions( "spamcf=s", "hamcf=s", "spamlog=s", "hamlog=s", "spamout=s", "hamout=s", "prefix=s", "rewrite", "o", "q", "s=i", "h=i", "c=i", "m=i", "g=i" ) or usage(); usage() if ( $opt_hamlog eq $opt_hamout ) || ( $opt_spamlog eq $opt_spamout ); $SPAM_THRESHOLD = $opt_s if $opt_s; $MIN_RULE_COUNT = $opt_c if $opt_c; $MIN_MSG_RULE_HITS = $opt_h if $opt_h; $MAX_RULES = $opt_m if $opt_m; $MAX_META_RULES = $opt_g if $opt_g; $RULE_PREFIX = $opt_prefix if $opt_prefix; $MIN_OVERLAP = $MIN_MSG_RULE_HITS; close(STDOUT) if ( !$opt_o ); my %overlapcount; my %rulepos; # Rule index into bit vectors my %meta_rules; my @meta_order; my $vecsize = 1; print STDERR "find-meta-rules running against $opt_hamlog and $opt_spamlog ...\n" unless $opt_q; foreach my $type (qw(SPAM HAM)) { my $in_file = $type eq "HAM" ? $opt_hamlog : $opt_spamlog; my $cf_file = $type eq "HAM" ? $opt_hamcf : $opt_spamcf; %overlapcount = (); %rulepos = (); %meta_rules = (); @meta_order = (); $vecsize = 1; if ( !$opt_o && $cf_file ) { open( STDOUT, "> $cf_file" ) || die "Can't open $cf_file $!"; select(STDOUT); $| = 1; } my @low_score_log; my %rulecount; open( LOG_FILE, "< $in_file" ) || die "Can't open $in_file $!"; # Create a cache of the low scoring messages # and count rule occurences for low scoring messages while ( my $line = ) { next if $line =~ /^#/; my ( undef, $score, undef, $rulestr ) = split( ' ', $line, 5 ); next unless $rulestr; next if $score >= $SPAM_THRESHOLD; my @rules = grep( !/$IGNORE_RULES/, split( /,/, $rulestr ) ); next if scalar @rules < $MIN_MSG_RULE_HITS; push( @low_score_log, { "rules" => [@rules], "score" => $score } ); $rulecount{$_}++ foreach (@rules); } close(LOG_FILE); my $when = `date -u`; chomp $when; printf("#\n# Meta $type rule generation run $when\n"); printf( "# %5d messages that scored <%d examined\n", scalar @low_score_log, $SPAM_THRESHOLD ); printf( "# %5d individual rules found\n", scalar keys %rulecount ); # Generate a lookup for bit position index # Most common rules are LSB my $pos = 0; foreach my $r ( sort { $rulecount{$b} <=> $rulecount{$a} } keys %rulecount ) { last if $rulecount{$r} < $MIN_RULE_COUNT; last if $pos > $MAX_RULES; $rulepos{$r} = $pos++; } %rulecount = (); printf( "# %5d candidate rules found\n", scalar keys %rulepos ); $vecsize = scalar keys %rulepos; my $vec_count = 0; my %uniqvec; foreach my $msg (@low_score_log) { # Setup the bit string signature my $vec = make_rule_vector( $msg->{'rules'} ); next if unpack( "%b*", $vec ) < $MIN_OVERLAP; $msg->{'vector'} = $vec; $vec_count++; $uniqvec{$vec}++; } printf( "# %5d unique signatures found\n", scalar keys %uniqvec ); printf( "# %5d messages scored <%d and hit at least %d candidate rules\n", $vec_count, $SPAM_THRESHOLD, $MIN_MSG_RULE_HITS ); # Compare all the unique vectors against the spam signatures # Resulting intersections contribute to final meta rules foreach my $uv ( sort { $uniqvec{$b} <=> $uniqvec{$a} } keys %uniqvec ) { # Iterate over messages that have a vector foreach my $msg ( grep { exists $_->{'vector'} } @low_score_log ) { my $v = $msg->{'vector'} & $uv; # Skip if insufficient overlap next if unpack( "%b*", $v ) < $MIN_OVERLAP; # Skip if this message already hit this vector next if exists $msg->{'hit'}{$v}; $overlapcount{$v}++; $msg->{'hit'}{$v}++; } } @low_score_log = (); # Clean up the overlapcount delete @overlapcount{ grep( { $overlapcount{$_} < $MIN_RULE_COUNT } keys %overlapcount ) }; printf( "# %5d signature overlaps available\n", scalar keys %overlapcount ); # Reduce number of overlaps if required if ( scalar keys %overlapcount > $MAX_META_RULES ) { my $count = 0; my @del_key; foreach ( sort by_overlap_count keys %overlapcount ) { push( @del_key, $_ ) if $count++ > $MAX_META_RULES; } delete @overlapcount{@del_key}; printf( "# %5d signature overlaps generated\n", scalar keys %overlapcount ); } printf("#\n\n"); next if ( scalar keys %overlapcount ) <= 0; # Sorted list of vectors, most matches first my @vectors = sort { unpack( "%b*", $b ) <=> unpack( "%b*", $a ) } keys %overlapcount; # Look for smaller vectors in bigger ones my %not_rules = find_not_rules( \@vectors ); @vectors = (); my $meta_rule_num = 1; my $capture_count = 0; # Construct sub meta tests foreach my $v ( sort by_overlap_count keys %overlapcount ) { my $name = make_rule_name( 'prefix' => "$RULE_PREFIX", 'type' => $type, 'num' => sprintf( "%03d", $meta_rule_num++ ) ); # Extract the rule name based on the vector my @match_rules = extract_rules($v); my $metarule = "( " . join( ' && ', @match_rules ) . " )"; my $metadesc = "Matched " . join( ', ', @match_rules ); my ($shortdesc) = $metadesc =~ /^(.{1,48})/; $shortdesc .= "*" if length($shortdesc) > 45; $meta_rules{$v} = ( { 'name' => $name, 'rule' => $metarule, 'desc' => $metadesc, 'shortdesc' => $shortdesc, } ); $meta_rules{$v}{'not_rules'} = [ sort by_overlap_count @{ $not_rules{$v} } ] if exists $not_rules{$v}; push( @meta_order, $v ); $capture_count += $overlapcount{$v}; print <<"END_RULE"; meta $name $metarule #count $name $overlapcount{$v} messages hit this sub test END_RULE } %not_rules = (); print "\n#############\n\n"; # Construct final meta rules foreach my $v ( sort by_overlap_count keys %overlapcount ) { my $negation = ''; if ( exists $meta_rules{$v}{'not_rules'} ) { foreach ( @{ $meta_rules{$v}{'not_rules'} } ) { $negation .= " && !$meta_rules{$_}{'name'}"; } } my $name = $meta_rules{$v}{'name'}; $name =~ s/^__//; $meta_rules{$v}{'parent_name'} = $name; print <<"END_RULE"; meta $name ( $meta_rules{$v}{'name'} $negation ) #describe $name $meta_rules{$v}{'desc'} describe $name $meta_rules{$v}{'shortdesc'} #score $name 0.01 END_RULE } #print "# $capture_count potential extra rule hits (not messages)\n\n"; close(STDOUT) if ( !$opt_o ); next unless $opt_rewrite; rewrite_log( $opt_spamlog, $opt_spamout ); rewrite_log( $opt_hamlog, $opt_hamout ); } print STDERR "find-meta-rules run complete.\n" unless $opt_q; sub make_rule_name { my %info = @_; my $name = "_"; $name .= "_$info{'prefix'}" if exists $info{'prefix'}; $name .= "_$info{'type'}" if exists $info{'type'}; $name .= "_$info{'date'}" if exists $info{'date'}; $name .= "_$info{'num'}" if exists $info{'num'}; warn("Rule name construction error: $name too short\n") if length($name) < 5; return $name; } sub extract_rules { my ($vector) = @_; my @match_rules; foreach my $r ( keys %rulepos ) { push( @match_rules, $r ) if vec( $vector, $rulepos{$r}, 1 ); } return @match_rules; } sub by_overlap_count { return $overlapcount{$b} <=> $overlapcount{$a}; } sub make_rule_vector { my $rules = shift(@_); my $vec = ''; vec( $vec, $vecsize, 1 ) = 0; foreach ( @{$rules} ) { vec( $vec, $rulepos{$_}, 1 ) = 1 if exists $rulepos{$_}; } return $vec; } # Find rules which are sub rules of others sub find_not_rules { my $vectors = shift(@_); my %not_rules; for ( my $i = 0 ; $i < scalar @{$vectors} ; $i++ ) { my $vi = $vectors->[$i]; for ( my $j = $i + 1 ; $j < scalar @{$vectors} ; $j++ ) { my $vj = $vectors->[$j]; # Only interested if j is in i next if ( $vi & $vj ) ne $vj; # j is a subset of i # i can be expressed in terms of j # j should be expressed as j && !i push( @{ $not_rules{$vj} }, $vi ); } } return %not_rules; } # Rewrite the mass-check logs to include meta rule hits sub rewrite_log { my $log_file = shift(@_); my $out_file = shift(@_); open( OLDLOG, "$log_file" ) || die "Can't open $log_file $!"; open( NEWLOG, "> $out_file" ) || die "Can't open $out_file $!"; while ( my $line = ) { next if $line =~ /^#/ && print NEWLOG $line; my @data = split( /\s+/, $line, 5 ); my @rules = grep( !/$IGNORE_RULES/, split( /,/, $data[3] ) ); if ( scalar @rules >= $MIN_MSG_RULE_HITS ) { my $vec = make_rule_vector( \@rules ); if ( unpack( "%b*", $vec ) >= $MIN_MSG_RULE_HITS ) { META: foreach my $metav (@meta_order) { next META if ( $metav & $vec ) ne $metav; # OK we have a hit # Check if this rule has not_rules associated with it if ( exists $meta_rules{$metav}{'not_rules'} ) { foreach my $nv ( @{ $meta_rules{$metav}{'not_rules'} } ) { # Skip this meta rule if it has a not hit next META if ( $nv & $vec ) eq $nv; } } # Found a direct hit $data[3] .= qq(,$meta_rules{$metav}{'parent_name'}); } } } print NEWLOG join( ' ', @data ); } close(NEWLOG); close(OLDLOG); } 1;