dspam: changed to only train on error

per suggestions by the dspam author
2012-06-22 23:44:55 -04:00 · 2012-06-22 23:44:55 -04:00 · 3db3565144
commit 3db3565144
parent 01c994439b
2 changed files with 119 additions and 109 deletions
--- a/plugins/dspam
+++ b/plugins/dspam
@ -18,13 +18,13 @@ contain a probability and confidence rating.

 =head1 TRAINING DSPAM

-Do not just enable dspam! Its false positive rate when untrained is high. The
-good news is; dspam learns very, very fast.
+If you enable dspam rejection without training first, you will lose valid
+mail. The dspam false positive rate is high when untrained. The good news is
+that dspam learns very, very fast.

-To get dspam into a useful state, it must be trained. The best method way to
-train dspam is to feed it two large equal sized corpuses of spam and ham from
-your mail server. The dspam authors suggest avoiding public corpuses. I train
-dspam as follows:
+The best way to train dspam is to feed it two large, equal-sized
+corpuses of spam and ham from your mail server. The dspam authors suggest
+avoiding public corpuses. I train dspam as follows:

 =over 4

@ -70,7 +70,7 @@ learn messages with negative karma as spam (see plugins/karma)

 =item spamassassin

-learn from spamassassins messages with autolearn=(ham|spam)
+learn from messages SpamAssassin marks with autolearn=(ham|spam). See SPAMASSASSIN.

 =item any

@ -135,7 +135,7 @@ after delivery (ie, users moving messages to/from spam folders), then the
 dspam signature must be in the headers.

 When using the dspam MySQL backend, use InnoDB tables. DSPAM training
-is dramatically slowed by MyISAM table locks and dspam requires lots
+is dramatically slowed by MyISAM table locks and dspam requires a lot
 of training. InnoDB has row level locking and updates are much faster.

 =head1 DSPAM periodic maintenance
@ -144,8 +144,6 @@ Install this cron job to clean up your DSPAM database.

 http://dspam.git.sourceforge.net/git/gitweb.cgi?p=dspam/dspam;a=tree;f=contrib/dspam_maintenance;hb=HEAD

-
-
 =head1 SPAMASSASSIN

 DSPAM can be trained by SpamAssassin. This relationship between them requires
@ -164,13 +162,14 @@ reduce the SA load.

 =item 2

-Autolearn must be enabled and configured in SpamAssassin. SA autolearn will
+For I<autolearn spamassassin> to work, autolearn must be enabled and
+configured in SpamAssassin. SA autolearn will
 determine if a message is learned by dspam. The settings to pay careful
 attention to in your SA local.cf file are I<bayes_auto_learn_threshold_spam>
 and I<bayes_auto_learn_threshold_nonspam>. Make sure they are set to
 conservative values that will yield no false positives.

-If you are using I<autolearn spamassassin> and reject, messages that exceed
+If you are using I<autolearn spamassassin> and I<reject>, messages that exceed
 the SA thresholds will cause dspam to reject them. Again I say, make sure
 the SA autolearn thresholds are set high enough to avoid false positives.

@ -207,7 +206,7 @@ use IO::Handle;
 use Socket qw(:DEFAULT :crlf);

 sub register {
-    my ($self, $qp) = shift, shift;
+    my ($self, $qp) = (shift, shift);

    $self->log(LOGERROR, "Bad parameters for the dspam plugin") if @_ % 2;

@ -222,7 +221,6 @@ sub data_post_handler {
    my $self = shift;
    my $transaction = shift || $self->qp->transaction;

-    $self->autolearn( $transaction );
    return (DECLINED) if $self->is_immune();

    if ( $transaction->data_size > 500_000 ) {
@ -231,16 +229,18 @@ sub data_post_handler {
    };

    my $username = $self->select_username( $transaction );
-    my $filtercmd = $self->get_filter_cmd( $transaction, $username );
+    my $dspam_bin = $self->{_args}{dspam_bin} || '/usr/local/bin/dspam';
+    my $filtercmd = "$dspam_bin --user $username --mode=tum --process --deliver=summary --stdout";
    $self->log(LOGDEBUG, $filtercmd);

    my $response = $self->dspam_process( $filtercmd, $transaction );
-    if ( ! $response ) {
+    if ( ! $response->{result} ) {
        $self->log(LOGWARN, "skip, no dspam response. Check logs for errors.");
        return (DECLINED);
    };

    $self->attach_headers( $response, $transaction );
+    $self->autolearn( $response, $transaction );

    return $self->log_and_return( $transaction );
 };
@ -279,8 +279,26 @@ sub assemble_message {
 sub dspam_process {
    my ( $self, $filtercmd, $transaction ) = @_;

-    return $self->dspam_process_backticks( $filtercmd );
-    #return $self->dspam_process_open2( $filtercmd, $transaction );
+    my $dspam_response = $self->dspam_process_backticks( $filtercmd );
+    #my $dspam_response = $self->dspam_process_open2( $filtercmd, $transaction );
+    #my $dspam_response = $self->dspam_process_fork( $filtercmd );
+
+    # X-DSPAM-Result: user@example.com; result="Spam"; class="Spam"; probability=1.0000; confidence=1.00; signature=N/A
+    # X-DSPAM-Result: smtpd; result="Innocent"; class="Innocent"; probability=0.0023; confidence=1.00; signature=4f8dae6a446008399211546
+    my ($r, $p, $c, $s)
+        = $dspam_response
+            =~ /result=\"(Spam|Innocent)\";.*?probability=([\d\.]+); confidence=([\d\.]+); signature=(.*)/;
+
+    return {
+        result      => $r,
+        probability => $p,
+        confidence  => $c,
+        signature   => $s,
+    };
+};
+
+sub dspam_process_fork {
+    my ( $self, $filtercmd, $transaction ) = @_;

    # yucky. This method (which forks) exercises a bug in qpsmtpd. When the
    # child exits, the Transaction::DESTROY method is called, which deletes
@ -305,7 +323,6 @@ sub dspam_process_backticks {
    my ( $self, $filtercmd ) = @_;

    my $filename = $self->qp->transaction->body_filename;
-    #my $response = `cat $filename | $filtercmd`; chomp $response;
    my $response = `$filtercmd < $filename`; chomp $response;
    $self->log(LOGDEBUG, $response);
    return $response;
@ -450,46 +467,11 @@ sub get_dspam_results {
    return \%d;
 };

-sub get_filter_cmd {
-    my ($self, $transaction, $user) = @_;
-
-    my $dspam_bin = $self->{_args}{dspam_bin} || '/usr/local/bin/dspam';
-    my $default = "$dspam_bin --user $user --mode=tum --process --deliver=summary --stdout";
-
-    my $learn = $self->{_args}{autolearn} or return $default;
-    return $default if ( $learn ne 'spamassassin' && $learn ne 'any' );
-
-    $self->log(LOGDEBUG, "attempting to learn from SA");
-
-    my $sa = $transaction->notes('spamassassin' );
-    if ( ! $sa || ! $sa->{is_spam} ) {
-        $self->log(LOGERROR, "SA results missing");
-        return $default;
-    };
-
-    if ( ! $sa->{autolearn} ) {
-        $self->log(LOGERROR, "SA autolearn unset");
-        return $default;
-    };
-
-    if ( $sa->{is_spam} eq 'Yes' && $sa->{autolearn} eq 'spam' ) {
-        return "$dspam_bin --user $user --mode=tum --source=corpus --class=spam --deliver=summary --stdout";
-    }
-    elsif ( $sa->{is_spam} eq 'No' && $sa->{autolearn} eq 'ham' ) {
-        return "$dspam_bin --user $user --mode=tum --source=corpus --class=innocent --deliver=summary --stdout";
-    };
-
-    return $default;
-};
-
 sub attach_headers {
-    my ($self, $response, $transaction) = @_;
+    my ($self, $r, $transaction) = @_;
    $transaction ||= $self->qp->transaction;

-    # X-DSPAM-Result: user@example.com; result="Spam"; class="Spam"; probability=1.0000; confidence=1.00; signature=N/A
-    # X-DSPAM-Result: smtpd; result="Innocent"; class="Innocent"; probability=0.0023; confidence=1.00; signature=4f8dae6a446008399211546
-    my ($result,$prob,$conf,$sig) = $response =~ /result=\"(Spam|Innocent)\";.*?probability=([\d\.]+); confidence=([\d\.]+); signature=(.*)/;
-    my $header_str = "$result, probability=$prob, confidence=$conf";
+    my $header_str = "$r->{result}, probability=$r->{probability}, confidence=$r->{confidence}";
    $self->log(LOGDEBUG, $header_str);
    my $name = 'X-DSPAM-Result';
    $transaction->header->delete($name) if $transaction->header->get($name);
@ -497,49 +479,108 @@ sub attach_headers {

    # the signature header is required if you intend to train dspam later.
    # In dspam.conf, set: Preference "signatureLocation=headers"
-    $transaction->header->add('X-DSPAM-Signature', $sig, 0);
+    $transaction->header->add('X-DSPAM-Signature', $r->{signature}, 0);
 };

-sub learn_as_ham {
+sub train_error_as_ham {
    my $self = shift;
    my $transaction = shift;

    my $user = $self->select_username( $transaction );
    my $dspam_bin = $self->{_args}{dspam_bin} || '/usr/local/bin/dspam';
-    my $cmd = "$dspam_bin --user $user --mode=tum --source=corpus --class=innocent --deliver=summary --stdout";
+    my $cmd = "$dspam_bin --user $user --mode=toe --source=error --class=innocent --deliver=summary --stdout";
    $self->dspam_process( $cmd, $transaction );
 };

-sub learn_as_spam {
+sub train_error_as_spam {
    my $self = shift;
    my $transaction = shift;

    my $user = $self->select_username( $transaction );
    my $dspam_bin = $self->{_args}{dspam_bin} || '/usr/local/bin/dspam';
-    my $cmd = "$dspam_bin --user $user --mode=tum --source=corpus --class=spam --deliver=summary --stdout";
+    my $cmd = "$dspam_bin --user $user --mode=toe --source=error --class=spam --deliver=summary --stdout";
    $self->dspam_process( $cmd, $transaction );
 };

 sub autolearn {
-    my ( $self, $transaction ) = @_;
+    my ( $self, $response, $transaction ) = @_;
+
+    defined $self->{_args}{autolearn} or return;
+
+    $self->autolearn_naughty( $response, $transaction ) and return;
+    $self->autolearn_karma( $response, $transaction ) and return;
+    $self->autolearn_spamassassin( $response, $transaction ) and return;
+};
+
+sub autolearn_naughty {
+    my ( $self, $response, $transaction ) = @_;

    my $learn = $self->{_args}{autolearn} or return;

-    if ( $learn eq 'naughty' || $learn eq 'any' ) {
-        if ( $self->connection->notes('naughty') ) {
-            $self->log(LOGINFO, "training naughty as spam");
-            $self->learn_as_spam( $transaction );
-        };
-    };
-    if ( $learn eq 'karma' || $learn eq 'any' ) {
-        my $karma = $self->connection->notes('karma');
-        if ( defined $karma && $karma <= -1 ) {
-            $self->log(LOGINFO, "training poor karma as spam");
-            $self->learn_as_spam( $transaction );
-        };
-        if ( defined $karma && $karma >= 1 ) {
-            $self->log(LOGINFO, "training good karma as ham");
-            $self->learn_as_ham( $transaction );
-        };
+    return if ( $learn ne 'naughty' && $learn ne 'any' );
+
+    if ( $self->connection->notes('naughty') && $response->{result} eq 'Innocent' ) {
+        $self->log(LOGINFO, "training naughty FN message as spam");
+        $self->train_error_as_spam( $transaction );
+        return 1;
    };
+
+    return;
+};
+
+sub autolearn_karma {
+    my ( $self, $response, $transaction ) = @_;
+
+    my $learn = $self->{_args}{autolearn} or return;
+
+    return if ( $learn ne 'karma' && $learn ne 'any' );
+
+    my $karma = $self->connection->notes('karma');
+    return if ! defined $karma;
+
+    if ( $karma <= -1  && $response->{result} eq 'Innocent' ) {
+        $self->log(LOGINFO, "training bad karma FN as spam");
+        $self->train_error_as_spam( $transaction );
+        return 1;
+    };
+
+    if ( $karma >= 1 && $response->{result} eq 'Spam' ) {
+        $self->log(LOGINFO, "training good karma FP as ham");
+        $self->train_error_as_ham( $transaction );
+        return 1;
+    };
+
+    return;
+};
+
+sub autolearn_spamassassin {
+    my ( $self, $response, $transaction ) = @_;
+
+    my $learn = $self->{_args}{autolearn} or return;
+
+    return if ( $learn ne 'spamassassin' && $learn ne 'any' );
+
+    my $sa = $transaction->notes('spamassassin' );
+    if ( ! $sa || ! $sa->{is_spam} ) {
+        $self->log(LOGERROR, "SA results missing");
+        return;
+    };
+
+    if ( ! $sa->{autolearn} ) {
+        $self->log(LOGERROR, "SA autolearn unset");
+        return;
+    };
+
+    if ( $sa->{is_spam} eq 'Yes' && $sa->{autolearn} eq 'spam' && $response->{result} eq 'Innocent' ) {
+        $self->log(LOGINFO, "training spamassassin FN as spam");
+        $self->train_error_as_spam( $transaction );
+        return 1;
+    }
+    elsif ( $sa->{is_spam} eq 'No' && $sa->{autolearn} eq 'ham' && $response->{result} eq 'Spam' ) {
+        $self->log(LOGINFO, "training spamassassin FP as ham");
+        $self->train_error_as_ham( $transaction );
+        return 1;
+    };
+
+    return;
 };
--- a/t/plugin_tests/dspam
+++ b/t/plugin_tests/dspam
@ -11,7 +11,6 @@ my $r;
 sub register_tests {
    my $self = shift;

-    $self->register_test('test_get_filter_cmd', 5);
    $self->register_test('test_get_dspam_results', 6);
    $self->register_test('test_log_and_return', 6);
    $self->register_test('test_reject_type', 3);
@ -83,36 +82,6 @@ sub test_get_dspam_results {
    };
 };

-sub test_get_filter_cmd {
-    my $self = shift;
-
-    my $transaction = $self->qp->transaction;
-    my $dspam = "/usr/local/bin/dspam";
-    $self->{_args}{dspam_bin} = $dspam;
-    $self->{_args}{autolearn} = 'spamassassin';
-
-    foreach my $user ( qw/ smtpd matt@example.com / ) {
-        my $answer = "$dspam --user smtpd --mode=tum --process --deliver=summary --stdout";
-        my $r = $self->get_filter_cmd($transaction, 'smtpd');
-        cmp_ok( $r, 'eq', $answer, "$user" );
-    };
-
-    $transaction->notes('spamassassin', { is_spam => 'No', autolearn => 'ham' } );
-    my $r = $self->get_filter_cmd($transaction, 'smtpd');
-    cmp_ok( $r, 'eq', "$dspam --user smtpd --mode=tum --source=corpus --class=innocent --deliver=summary --stdout",
-        "smtpd, ham" );
-
-    $transaction->notes('spamassassin', { is_spam => 'Yes', autolearn => 'spam', score => 110 } );
-    $r = $self->get_filter_cmd($transaction, 'smtpd');
-    cmp_ok( $r, 'eq', "$dspam --user smtpd --mode=tum --source=corpus --class=spam --deliver=summary --stdout",
-        "smtpd, spam" );
-
-    $transaction->notes('spamassassin', { is_spam => 'No', autolearn => 'spam' } );
-    $r = $self->get_filter_cmd($transaction, 'smtpd');
-    cmp_ok( $r, 'eq', "$dspam --user smtpd --mode=tum --process --deliver=summary --stdout",
-        "smtpd, spam" );
-};
-
 sub test_reject_type {
    my $self = shift;