From 964eab3b2b35256a47845f4f330cdbacee43b74d Mon Sep 17 00:00:00 2001 From: Matt Simerson Date: Fri, 22 Jun 2012 23:44:55 -0400 Subject: [PATCH] dspam: changed to only train on error per suggestions by the dspam author --- plugins/dspam | 197 ++++++++++++++++++++++++++----------------- t/plugin_tests/dspam | 31 ------- 2 files changed, 119 insertions(+), 109 deletions(-) diff --git a/plugins/dspam b/plugins/dspam index 51e067f..d80551b 100644 --- a/plugins/dspam +++ b/plugins/dspam @@ -18,13 +18,13 @@ contain a probability and confidence rating. =head1 TRAINING DSPAM -Do not just enable dspam! Its false positive rate when untrained is high. The -good news is; dspam learns very, very fast. +If you enable dspam rejection without training first, you will lose valid +mail. The dspam false positive rate is high when untrained. The good news is; +dspam learns very, very fast. -To get dspam into a useful state, it must be trained. The best method way to -train dspam is to feed it two large equal sized corpuses of spam and ham from -your mail server. The dspam authors suggest avoiding public corpuses. I train -dspam as follows: +The best method way to train dspam is to feed it two large equal sized +corpuses of spam and ham from your mail server. The dspam authors suggest +avoiding public corpuses. I train dspam as follows: =over 4 @@ -70,7 +70,7 @@ learn messages with negative karma as spam (see plugins/karma) =item spamassassin -learn from spamassassins messages with autolearn=(ham|spam) +learn from spamassassins messages with autolearn=(ham|spam). See SPAMASSASSIN. =item any @@ -135,7 +135,7 @@ after delivery (ie, users moving messages to/from spam folders), then the dspam signature must be in the headers. When using the dspam MySQL backend, use InnoDB tables. DSPAM training -is dramatically slowed by MyISAM table locks and dspam requires lots +is dramatically slowed by MyISAM table locks and dspam requires a lot of training. InnoDB has row level locking and updates are much faster. =head1 DSPAM periodic maintenance @@ -144,8 +144,6 @@ Install this cron job to clean up your DSPAM database. http://dspam.git.sourceforge.net/git/gitweb.cgi?p=dspam/dspam;a=tree;f=contrib/dspam_maintenance;hb=HEAD - - =head1 SPAMASSASSIN DSPAM can be trained by SpamAssassin. This relationship between them requires @@ -164,13 +162,14 @@ reduce the SA load. =item 2 -Autolearn must be enabled and configured in SpamAssassin. SA autolearn will +For I to work, autolearn must be enabled and +configured in SpamAssassin. SA autolearn will determine if a message is learned by dspam. The settings to pay careful attention to in your SA local.cf file are I and I. Make sure they are set to conservative values that will yield no false positives. -If you are using I and reject, messages that exceed +If you are using I and I, messages that exceed the SA threshholds will cause dspam to reject them. Again I say, make sure the SA autolearn threshholds are set high enough to avoid false positives. @@ -207,7 +206,7 @@ use IO::Handle; use Socket qw(:DEFAULT :crlf); sub register { - my ($self, $qp) = shift, shift; + my ($self, $qp) = (shift, shift); $self->log(LOGERROR, "Bad parameters for the dspam plugin") if @_ % 2; @@ -222,7 +221,6 @@ sub data_post_handler { my $self = shift; my $transaction = shift || $self->qp->transaction; - $self->autolearn( $transaction ); return (DECLINED) if $self->is_immune(); if ( $transaction->data_size > 500_000 ) { @@ -231,16 +229,18 @@ sub data_post_handler { }; my $username = $self->select_username( $transaction ); - my $filtercmd = $self->get_filter_cmd( $transaction, $username ); + my $dspam_bin = $self->{_args}{dspam_bin} || '/usr/local/bin/dspam'; + my $filtercmd = "$dspam_bin --user $username --mode=tum --process --deliver=summary --stdout"; $self->log(LOGDEBUG, $filtercmd); my $response = $self->dspam_process( $filtercmd, $transaction ); - if ( ! $response ) { + if ( ! $response->{result} ) { $self->log(LOGWARN, "skip, no dspam response. Check logs for errors."); return (DECLINED); }; $self->attach_headers( $response, $transaction ); + $self->autolearn( $response, $transaction ); return $self->log_and_return( $transaction ); }; @@ -279,8 +279,26 @@ sub assemble_message { sub dspam_process { my ( $self, $filtercmd, $transaction ) = @_; - return $self->dspam_process_backticks( $filtercmd ); - #return $self->dspam_process_open2( $filtercmd, $transaction ); + my $dspam_response = $self->dspam_process_backticks( $filtercmd ); + #my $dspam_response = $self->dspam_process_open2( $filtercmd, $transaction ); + #my $dspam_response = $self->dspam_process_fork( $filtercmd ); + + # X-DSPAM-Result: user@example.com; result="Spam"; class="Spam"; probability=1.0000; confidence=1.00; signature=N/A + # X-DSPAM-Result: smtpd; result="Innocent"; class="Innocent"; probability=0.0023; confidence=1.00; signature=4f8dae6a446008399211546 + my ($r, $p, $c, $s) + = $dspam_response + =~ /result=\"(Spam|Innocent)\";.*?probability=([\d\.]+); confidence=([\d\.]+); signature=(.*)/; + + return { + result => $r, + probability => $p, + confidence => $c, + signature => $s, + }; +}; + +sub dspam_process_fork { + my ( $self, $filtercmd, $transaction ) = @_; # yucky. This method (which forks) exercises a bug in qpsmtpd. When the # child exits, the Transaction::DESTROY method is called, which deletes @@ -305,7 +323,6 @@ sub dspam_process_backticks { my ( $self, $filtercmd ) = @_; my $filename = $self->qp->transaction->body_filename; - #my $response = `cat $filename | $filtercmd`; chomp $response; my $response = `$filtercmd < $filename`; chomp $response; $self->log(LOGDEBUG, $response); return $response; @@ -450,46 +467,11 @@ sub get_dspam_results { return \%d; }; -sub get_filter_cmd { - my ($self, $transaction, $user) = @_; - - my $dspam_bin = $self->{_args}{dspam_bin} || '/usr/local/bin/dspam'; - my $default = "$dspam_bin --user $user --mode=tum --process --deliver=summary --stdout"; - - my $learn = $self->{_args}{autolearn} or return $default; - return $default if ( $learn ne 'spamassassin' && $learn ne 'any' ); - - $self->log(LOGDEBUG, "attempting to learn from SA"); - - my $sa = $transaction->notes('spamassassin' ); - if ( ! $sa || ! $sa->{is_spam} ) { - $self->log(LOGERROR, "SA results missing"); - return $default; - }; - - if ( ! $sa->{autolearn} ) { - $self->log(LOGERROR, "SA autolearn unset"); - return $default; - }; - - if ( $sa->{is_spam} eq 'Yes' && $sa->{autolearn} eq 'spam' ) { - return "$dspam_bin --user $user --mode=tum --source=corpus --class=spam --deliver=summary --stdout"; - } - elsif ( $sa->{is_spam} eq 'No' && $sa->{autolearn} eq 'ham' ) { - return "$dspam_bin --user $user --mode=tum --source=corpus --class=innocent --deliver=summary --stdout"; - }; - - return $default; -}; - sub attach_headers { - my ($self, $response, $transaction) = @_; + my ($self, $r, $transaction) = @_; $transaction ||= $self->qp->transaction; - # X-DSPAM-Result: user@example.com; result="Spam"; class="Spam"; probability=1.0000; confidence=1.00; signature=N/A - # X-DSPAM-Result: smtpd; result="Innocent"; class="Innocent"; probability=0.0023; confidence=1.00; signature=4f8dae6a446008399211546 - my ($result,$prob,$conf,$sig) = $response =~ /result=\"(Spam|Innocent)\";.*?probability=([\d\.]+); confidence=([\d\.]+); signature=(.*)/; - my $header_str = "$result, probability=$prob, confidence=$conf"; + my $header_str = "$r->{result}, probability=$r->{probability}, confidence=$r->{confidence}"; $self->log(LOGDEBUG, $header_str); my $name = 'X-DSPAM-Result'; $transaction->header->delete($name) if $transaction->header->get($name); @@ -497,49 +479,108 @@ sub attach_headers { # the signature header is required if you intend to train dspam later. # In dspam.conf, set: Preference "signatureLocation=headers" - $transaction->header->add('X-DSPAM-Signature', $sig, 0); + $transaction->header->add('X-DSPAM-Signature', $r->{signature}, 0); }; -sub learn_as_ham { +sub train_error_as_ham { my $self = shift; my $transaction = shift; my $user = $self->select_username( $transaction ); my $dspam_bin = $self->{_args}{dspam_bin} || '/usr/local/bin/dspam'; - my $cmd = "$dspam_bin --user $user --mode=tum --source=corpus --class=innocent --deliver=summary --stdout"; + my $cmd = "$dspam_bin --user $user --mode=toe --source=error --class=innocent --deliver=summary --stdout"; $self->dspam_process( $cmd, $transaction ); }; -sub learn_as_spam { +sub train_error_as_spam { my $self = shift; my $transaction = shift; my $user = $self->select_username( $transaction ); my $dspam_bin = $self->{_args}{dspam_bin} || '/usr/local/bin/dspam'; - my $cmd = "$dspam_bin --user $user --mode=tum --source=corpus --class=spam --deliver=summary --stdout"; + my $cmd = "$dspam_bin --user $user --mode=toe --source=error --class=spam --deliver=summary --stdout"; $self->dspam_process( $cmd, $transaction ); }; sub autolearn { - my ( $self, $transaction ) = @_; + my ( $self, $response, $transaction ) = @_; + + defined $self->{_args}{autolearn} or return; + + $self->autolearn_naughty( $response, $transaction ) and return; + $self->autolearn_karma( $response, $transaction ) and return; + $self->autolearn_spamassassin( $response, $transaction ) and return; +}; + +sub autolearn_naughty { + my ( $self, $response, $transaction ) = @_; my $learn = $self->{_args}{autolearn} or return; - if ( $learn eq 'naughty' || $learn eq 'any' ) { - if ( $self->connection->notes('naughty') ) { - $self->log(LOGINFO, "training naughty as spam"); - $self->learn_as_spam( $transaction ); - }; - }; - if ( $learn eq 'karma' || $learn eq 'any' ) { - my $karma = $self->connection->notes('karma'); - if ( defined $karma && $karma <= -1 ) { - $self->log(LOGINFO, "training poor karma as spam"); - $self->learn_as_spam( $transaction ); - }; - if ( defined $karma && $karma >= 1 ) { - $self->log(LOGINFO, "training good karma as ham"); - $self->learn_as_ham( $transaction ); - }; + return if ( $learn ne 'naughty' && $learn ne 'any' ); + + if ( $self->connection->notes('naughty') && $response->{result} eq 'Innocent' ) { + $self->log(LOGINFO, "training naughty FN message as spam"); + $self->train_error_as_spam( $transaction ); + return 1; }; + + return; +}; + +sub autolearn_karma { + my ( $self, $response, $transaction ) = @_; + + my $learn = $self->{_args}{autolearn} or return; + + return if ( $learn ne 'karma' && $learn ne 'any' ); + + my $karma = $self->connection->notes('karma'); + return if ! defined $karma; + + if ( $karma <= -1 && $response->{result} eq 'Innocent' ) { + $self->log(LOGINFO, "training bad karma FN as spam"); + $self->train_error_as_spam( $transaction ); + return 1; + }; + + if ( $karma >= 1 && $response->{result} eq 'Spam' ) { + $self->log(LOGINFO, "training good karma FP as ham"); + $self->train_error_as_ham( $transaction ); + return 1; + }; + + return; +}; + +sub autolearn_spamassassin { + my ( $self, $response, $transaction ) = @_; + + my $learn = $self->{_args}{autolearn} or return; + + return if ( $learn ne 'spamassassin' && $learn ne 'any' ); + + my $sa = $transaction->notes('spamassassin' ); + if ( ! $sa || ! $sa->{is_spam} ) { + $self->log(LOGERROR, "SA results missing"); + return; + }; + + if ( ! $sa->{autolearn} ) { + $self->log(LOGERROR, "SA autolearn unset"); + return; + }; + + if ( $sa->{is_spam} eq 'Yes' && $sa->{autolearn} eq 'spam' && $response->{result} eq 'Innocent' ) { + $self->log(LOGINFO, "training spamassassin FN as spam"); + $self->train_error_as_spam( $transaction ); + return 1; + } + elsif ( $sa->{is_spam} eq 'No' && $sa->{autolearn} eq 'ham' && $response->{result} eq 'Spam' ) { + $self->log(LOGINFO, "training spamassassin FP as ham"); + $self->train_error_as_ham( $transaction ); + return 1; + }; + + return; }; diff --git a/t/plugin_tests/dspam b/t/plugin_tests/dspam index 5f104f1..4752ec8 100644 --- a/t/plugin_tests/dspam +++ b/t/plugin_tests/dspam @@ -11,7 +11,6 @@ my $r; sub register_tests { my $self = shift; - $self->register_test('test_get_filter_cmd', 5); $self->register_test('test_get_dspam_results', 6); $self->register_test('test_log_and_return', 6); $self->register_test('test_reject_type', 3); @@ -83,36 +82,6 @@ sub test_get_dspam_results { }; }; -sub test_get_filter_cmd { - my $self = shift; - - my $transaction = $self->qp->transaction; - my $dspam = "/usr/local/bin/dspam"; - $self->{_args}{dspam_bin} = $dspam; - $self->{_args}{autolearn} = 'spamassassin'; - - foreach my $user ( qw/ smtpd matt@example.com / ) { - my $answer = "$dspam --user smtpd --mode=tum --process --deliver=summary --stdout"; - my $r = $self->get_filter_cmd($transaction, 'smtpd'); - cmp_ok( $r, 'eq', $answer, "$user" ); - }; - - $transaction->notes('spamassassin', { is_spam => 'No', autolearn => 'ham' } ); - my $r = $self->get_filter_cmd($transaction, 'smtpd'); - cmp_ok( $r, 'eq', "$dspam --user smtpd --mode=tum --source=corpus --class=innocent --deliver=summary --stdout", - "smtpd, ham" ); - - $transaction->notes('spamassassin', { is_spam => 'Yes', autolearn => 'spam', score => 110 } ); - $r = $self->get_filter_cmd($transaction, 'smtpd'); - cmp_ok( $r, 'eq', "$dspam --user smtpd --mode=tum --source=corpus --class=spam --deliver=summary --stdout", - "smtpd, spam" ); - - $transaction->notes('spamassassin', { is_spam => 'No', autolearn => 'spam' } ); - $r = $self->get_filter_cmd($transaction, 'smtpd'); - cmp_ok( $r, 'eq', "$dspam --user smtpd --mode=tum --process --deliver=summary --stdout", - "smtpd, spam" ); -}; - sub test_reject_type { my $self = shift;