dspam: changed to only train on error

per suggestions by the dspam author
This commit is contained in:
Matt Simerson 2012-06-22 23:44:55 -04:00
parent 01c994439b
commit 3db3565144
2 changed files with 119 additions and 109 deletions

View File

@ -18,13 +18,13 @@ contain a probability and confidence rating.
=head1 TRAINING DSPAM =head1 TRAINING DSPAM
Do not just enable dspam! Its false positive rate when untrained is high. The If you enable dspam rejection without training first, you will lose valid
good news is; dspam learns very, very fast. mail. The dspam false positive rate is high when untrained. The good news is;
dspam learns very, very fast.
To get dspam into a useful state, it must be trained. The best way to The best way to train dspam is to feed it two large equal sized
train dspam is to feed it two large equal sized corpuses of spam and ham from corpuses of spam and ham from your mail server. The dspam authors suggest
your mail server. The dspam authors suggest avoiding public corpuses. I train avoiding public corpuses. I train dspam as follows:
dspam as follows:
=over 4 =over 4
@ -70,7 +70,7 @@ learn messages with negative karma as spam (see plugins/karma)
=item spamassassin =item spamassassin
learn from spamassassins messages with autolearn=(ham|spam) learn from spamassassins messages with autolearn=(ham|spam). See SPAMASSASSIN.
=item any =item any
@ -135,7 +135,7 @@ after delivery (ie, users moving messages to/from spam folders), then the
dspam signature must be in the headers. dspam signature must be in the headers.
When using the dspam MySQL backend, use InnoDB tables. DSPAM training When using the dspam MySQL backend, use InnoDB tables. DSPAM training
is dramatically slowed by MyISAM table locks and dspam requires lots is dramatically slowed by MyISAM table locks and dspam requires a lot
of training. InnoDB has row level locking and updates are much faster. of training. InnoDB has row level locking and updates are much faster.
=head1 DSPAM periodic maintenance =head1 DSPAM periodic maintenance
@ -144,8 +144,6 @@ Install this cron job to clean up your DSPAM database.
http://dspam.git.sourceforge.net/git/gitweb.cgi?p=dspam/dspam;a=tree;f=contrib/dspam_maintenance;hb=HEAD http://dspam.git.sourceforge.net/git/gitweb.cgi?p=dspam/dspam;a=tree;f=contrib/dspam_maintenance;hb=HEAD
=head1 SPAMASSASSIN =head1 SPAMASSASSIN
DSPAM can be trained by SpamAssassin. This relationship between them requires DSPAM can be trained by SpamAssassin. This relationship between them requires
@ -164,13 +162,14 @@ reduce the SA load.
=item 2 =item 2
Autolearn must be enabled and configured in SpamAssassin. SA autolearn will For I<autolearn spamassassin> to work, autolearn must be enabled and
configured in SpamAssassin. SA autolearn will
determine if a message is learned by dspam. The settings to pay careful determine if a message is learned by dspam. The settings to pay careful
attention to in your SA local.cf file are I<bayes_auto_learn_threshold_spam> attention to in your SA local.cf file are I<bayes_auto_learn_threshold_spam>
and I<bayes_auto_learn_threshold_nonspam>. Make sure they are set to and I<bayes_auto_learn_threshold_nonspam>. Make sure they are set to
conservative values that will yield no false positives. conservative values that will yield no false positives.
If you are using I<autolearn spamassassin> and reject, messages that exceed If you are using I<autolearn spamassassin> and I<reject>, messages that exceed
the SA thresholds will cause dspam to reject them. Again I say, make sure the SA thresholds will cause dspam to reject them. Again I say, make sure
the SA autolearn thresholds are set high enough to avoid false positives. the SA autolearn thresholds are set high enough to avoid false positives.
@ -207,7 +206,7 @@ use IO::Handle;
use Socket qw(:DEFAULT :crlf); use Socket qw(:DEFAULT :crlf);
sub register { sub register {
my ($self, $qp) = shift, shift; my ($self, $qp) = (shift, shift);
$self->log(LOGERROR, "Bad parameters for the dspam plugin") if @_ % 2; $self->log(LOGERROR, "Bad parameters for the dspam plugin") if @_ % 2;
@ -222,7 +221,6 @@ sub data_post_handler {
my $self = shift; my $self = shift;
my $transaction = shift || $self->qp->transaction; my $transaction = shift || $self->qp->transaction;
$self->autolearn( $transaction );
return (DECLINED) if $self->is_immune(); return (DECLINED) if $self->is_immune();
if ( $transaction->data_size > 500_000 ) { if ( $transaction->data_size > 500_000 ) {
@ -231,16 +229,18 @@ sub data_post_handler {
}; };
my $username = $self->select_username( $transaction ); my $username = $self->select_username( $transaction );
my $filtercmd = $self->get_filter_cmd( $transaction, $username ); my $dspam_bin = $self->{_args}{dspam_bin} || '/usr/local/bin/dspam';
my $filtercmd = "$dspam_bin --user $username --mode=tum --process --deliver=summary --stdout";
$self->log(LOGDEBUG, $filtercmd); $self->log(LOGDEBUG, $filtercmd);
my $response = $self->dspam_process( $filtercmd, $transaction ); my $response = $self->dspam_process( $filtercmd, $transaction );
if ( ! $response ) { if ( ! $response->{result} ) {
$self->log(LOGWARN, "skip, no dspam response. Check logs for errors."); $self->log(LOGWARN, "skip, no dspam response. Check logs for errors.");
return (DECLINED); return (DECLINED);
}; };
$self->attach_headers( $response, $transaction ); $self->attach_headers( $response, $transaction );
$self->autolearn( $response, $transaction );
return $self->log_and_return( $transaction ); return $self->log_and_return( $transaction );
}; };
@ -279,8 +279,26 @@ sub assemble_message {
sub dspam_process { sub dspam_process {
my ( $self, $filtercmd, $transaction ) = @_; my ( $self, $filtercmd, $transaction ) = @_;
return $self->dspam_process_backticks( $filtercmd ); my $dspam_response = $self->dspam_process_backticks( $filtercmd );
#return $self->dspam_process_open2( $filtercmd, $transaction ); #my $dspam_response = $self->dspam_process_open2( $filtercmd, $transaction );
#my $dspam_response = $self->dspam_process_fork( $filtercmd );
# X-DSPAM-Result: user@example.com; result="Spam"; class="Spam"; probability=1.0000; confidence=1.00; signature=N/A
# X-DSPAM-Result: smtpd; result="Innocent"; class="Innocent"; probability=0.0023; confidence=1.00; signature=4f8dae6a446008399211546
my ($r, $p, $c, $s)
= $dspam_response
=~ /result=\"(Spam|Innocent)\";.*?probability=([\d\.]+); confidence=([\d\.]+); signature=(.*)/;
return {
result => $r,
probability => $p,
confidence => $c,
signature => $s,
};
};
sub dspam_process_fork {
my ( $self, $filtercmd, $transaction ) = @_;
# yucky. This method (which forks) exercises a bug in qpsmtpd. When the # yucky. This method (which forks) exercises a bug in qpsmtpd. When the
# child exits, the Transaction::DESTROY method is called, which deletes # child exits, the Transaction::DESTROY method is called, which deletes
@ -305,7 +323,6 @@ sub dspam_process_backticks {
my ( $self, $filtercmd ) = @_; my ( $self, $filtercmd ) = @_;
my $filename = $self->qp->transaction->body_filename; my $filename = $self->qp->transaction->body_filename;
#my $response = `cat $filename | $filtercmd`; chomp $response;
my $response = `$filtercmd < $filename`; chomp $response; my $response = `$filtercmd < $filename`; chomp $response;
$self->log(LOGDEBUG, $response); $self->log(LOGDEBUG, $response);
return $response; return $response;
@ -450,46 +467,11 @@ sub get_dspam_results {
return \%d; return \%d;
}; };
# Build the dspam command line used to filter one message.
#
# Arguments:
#   $transaction - object whose notes('spamassassin') entry is consulted
#   $user        - value placed after --user on the command line
#
# Returns a command string. The plain "--process" command is returned unless
# the autolearn argument is 'spamassassin' or 'any' AND the spamassassin note
# carries an is_spam / autolearn pair that agree (Yes+spam or No+ham); in
# those two cases a "--source=corpus" command with the matching --class is
# returned instead.
sub get_filter_cmd {
    my ($self, $transaction, $user) = @_;

    my $bin     = $self->{_args}{dspam_bin} || '/usr/local/bin/dspam';
    my $head    = "$bin --user $user --mode=tum";
    my $tail    = "--deliver=summary --stdout";
    my $process = "$head --process $tail";

    # autolearn unset, or set to a mode that does not involve SpamAssassin
    my $mode = $self->{_args}{autolearn};
    return $process unless $mode;
    return $process unless ( $mode eq 'spamassassin' || $mode eq 'any' );

    $self->log(LOGDEBUG, "attempting to learn from SA");

    my $sa = $transaction->notes('spamassassin' );
    unless ( $sa && $sa->{is_spam} ) {
        $self->log(LOGERROR, "SA results missing");
        return $process;
    }
    unless ( $sa->{autolearn} ) {
        $self->log(LOGERROR, "SA autolearn unset");
        return $process;
    }

    # train only when the SA verdict and the SA autolearn decision agree
    return "$head --source=corpus --class=spam $tail"
        if ( $sa->{is_spam} eq 'Yes' && $sa->{autolearn} eq 'spam' );
    return "$head --source=corpus --class=innocent $tail"
        if ( $sa->{is_spam} eq 'No' && $sa->{autolearn} eq 'ham' );

    return $process;
};
sub attach_headers { sub attach_headers {
my ($self, $response, $transaction) = @_; my ($self, $r, $transaction) = @_;
$transaction ||= $self->qp->transaction; $transaction ||= $self->qp->transaction;
# X-DSPAM-Result: user@example.com; result="Spam"; class="Spam"; probability=1.0000; confidence=1.00; signature=N/A my $header_str = "$r->{result}, probability=$r->{probability}, confidence=$r->{confidence}";
# X-DSPAM-Result: smtpd; result="Innocent"; class="Innocent"; probability=0.0023; confidence=1.00; signature=4f8dae6a446008399211546
my ($result,$prob,$conf,$sig) = $response =~ /result=\"(Spam|Innocent)\";.*?probability=([\d\.]+); confidence=([\d\.]+); signature=(.*)/;
my $header_str = "$result, probability=$prob, confidence=$conf";
$self->log(LOGDEBUG, $header_str); $self->log(LOGDEBUG, $header_str);
my $name = 'X-DSPAM-Result'; my $name = 'X-DSPAM-Result';
$transaction->header->delete($name) if $transaction->header->get($name); $transaction->header->delete($name) if $transaction->header->get($name);
@ -497,49 +479,108 @@ sub attach_headers {
# the signature header is required if you intend to train dspam later. # the signature header is required if you intend to train dspam later.
# In dspam.conf, set: Preference "signatureLocation=headers" # In dspam.conf, set: Preference "signatureLocation=headers"
$transaction->header->add('X-DSPAM-Signature', $sig, 0); $transaction->header->add('X-DSPAM-Signature', $r->{signature}, 0);
}; };
sub learn_as_ham { sub train_error_as_ham {
my $self = shift; my $self = shift;
my $transaction = shift; my $transaction = shift;
my $user = $self->select_username( $transaction ); my $user = $self->select_username( $transaction );
my $dspam_bin = $self->{_args}{dspam_bin} || '/usr/local/bin/dspam'; my $dspam_bin = $self->{_args}{dspam_bin} || '/usr/local/bin/dspam';
my $cmd = "$dspam_bin --user $user --mode=tum --source=corpus --class=innocent --deliver=summary --stdout"; my $cmd = "$dspam_bin --user $user --mode=toe --source=error --class=innocent --deliver=summary --stdout";
$self->dspam_process( $cmd, $transaction ); $self->dspam_process( $cmd, $transaction );
}; };
sub learn_as_spam { sub train_error_as_spam {
my $self = shift; my $self = shift;
my $transaction = shift; my $transaction = shift;
my $user = $self->select_username( $transaction ); my $user = $self->select_username( $transaction );
my $dspam_bin = $self->{_args}{dspam_bin} || '/usr/local/bin/dspam'; my $dspam_bin = $self->{_args}{dspam_bin} || '/usr/local/bin/dspam';
my $cmd = "$dspam_bin --user $user --mode=tum --source=corpus --class=spam --deliver=summary --stdout"; my $cmd = "$dspam_bin --user $user --mode=toe --source=error --class=spam --deliver=summary --stdout";
$self->dspam_process( $cmd, $transaction ); $self->dspam_process( $cmd, $transaction );
}; };
sub autolearn { sub autolearn {
my ( $self, $transaction ) = @_; my ( $self, $response, $transaction ) = @_;
defined $self->{_args}{autolearn} or return;
$self->autolearn_naughty( $response, $transaction ) and return;
$self->autolearn_karma( $response, $transaction ) and return;
$self->autolearn_spamassassin( $response, $transaction ) and return;
};
sub autolearn_naughty {
my ( $self, $response, $transaction ) = @_;
my $learn = $self->{_args}{autolearn} or return; my $learn = $self->{_args}{autolearn} or return;
if ( $learn eq 'naughty' || $learn eq 'any' ) { return if ( $learn ne 'naughty' && $learn ne 'any' );
if ( $self->connection->notes('naughty') ) {
$self->log(LOGINFO, "training naughty as spam"); if ( $self->connection->notes('naughty') && $response->{result} eq 'Innocent' ) {
$self->learn_as_spam( $transaction ); $self->log(LOGINFO, "training naughty FN message as spam");
}; $self->train_error_as_spam( $transaction );
}; return 1;
if ( $learn eq 'karma' || $learn eq 'any' ) {
my $karma = $self->connection->notes('karma');
if ( defined $karma && $karma <= -1 ) {
$self->log(LOGINFO, "training poor karma as spam");
$self->learn_as_spam( $transaction );
};
if ( defined $karma && $karma >= 1 ) {
$self->log(LOGINFO, "training good karma as ham");
$self->learn_as_ham( $transaction );
};
}; };
return;
};
# Train dspam on its own mistakes, using the connection's karma note as the
# reference verdict.
#
# Arguments:
#   $response    - hashref; its {result} is compared to 'Innocent' / 'Spam'
#   $transaction - passed through to the train_error_as_* methods
#
# Returns 1 when a training call was made, otherwise an empty return.
# Does nothing unless the autolearn argument is 'karma' or 'any' and a
# karma note is defined on the connection.
sub autolearn_karma {
    my ( $self, $response, $transaction ) = @_;

    my $mode = $self->{_args}{autolearn};
    return unless $mode;
    return unless ( $mode eq 'karma' || $mode eq 'any' );

    my $karma = $self->connection->notes('karma');
    return unless defined $karma;

    # bad karma, but dspam said Innocent: a false negative
    if ( $karma <= -1 && $response->{result} eq 'Innocent' ) {
        $self->log(LOGINFO, "training bad karma FN as spam");
        $self->train_error_as_spam( $transaction );
        return 1;
    }
    # good karma, but dspam said Spam: a false positive
    elsif ( $karma >= 1 && $response->{result} eq 'Spam' ) {
        $self->log(LOGINFO, "training good karma FP as ham");
        $self->train_error_as_ham( $transaction );
        return 1;
    }

    return;
};
# Train dspam on its own mistakes, using SpamAssassin's autolearn decision as
# the reference verdict.
#
# Arguments:
#   $response    - hashref; its {result} is compared to 'Innocent' / 'Spam'
#   $transaction - source of the 'spamassassin' note, also passed through to
#                  the train_error_as_* methods
#
# Returns 1 when a training call was made, otherwise an empty return.
sub autolearn_spamassassin {
my ( $self, $response, $transaction ) = @_;
my $learn = $self->{_args}{autolearn} or return;
# only act when autolearn is 'spamassassin' or 'any'
return if ( $learn ne 'spamassassin' && $learn ne 'any' );
my $sa = $transaction->notes('spamassassin' );
# both the is_spam verdict and the autolearn value must be present in the note
if ( ! $sa || ! $sa->{is_spam} ) {
$self->log(LOGERROR, "SA results missing");
return;
};
if ( ! $sa->{autolearn} ) {
$self->log(LOGERROR, "SA autolearn unset");
return;
};
# SA says spam and autolearned it as spam, but dspam said Innocent: false negative
if ( $sa->{is_spam} eq 'Yes' && $sa->{autolearn} eq 'spam' && $response->{result} eq 'Innocent' ) {
$self->log(LOGINFO, "training spamassassin FN as spam");
$self->train_error_as_spam( $transaction );
return 1;
}
# SA says ham and autolearned it as ham, but dspam said Spam: false positive
elsif ( $sa->{is_spam} eq 'No' && $sa->{autolearn} eq 'ham' && $response->{result} eq 'Spam' ) {
$self->log(LOGINFO, "training spamassassin FP as ham");
$self->train_error_as_ham( $transaction );
return 1;
};
return;
}; };

View File

@ -11,7 +11,6 @@ my $r;
sub register_tests { sub register_tests {
my $self = shift; my $self = shift;
$self->register_test('test_get_filter_cmd', 5);
$self->register_test('test_get_dspam_results', 6); $self->register_test('test_get_dspam_results', 6);
$self->register_test('test_log_and_return', 6); $self->register_test('test_log_and_return', 6);
$self->register_test('test_reject_type', 3); $self->register_test('test_reject_type', 3);
@ -83,36 +82,6 @@ sub test_get_dspam_results {
}; };
}; };
# Exercise get_filter_cmd with autolearn set to 'spamassassin'.
#
# Runs 5 assertions: the plain --process command for two different users,
# the corpus/innocent command, the corpus/spam command, and the fallback to
# --process when the SA verdict and autolearn value disagree.
sub test_get_filter_cmd {
    my $self = shift;

    my $transaction = $self->qp->transaction;
    my $dspam = "/usr/local/bin/dspam";
    $self->{_args}{dspam_bin} = $dspam;
    $self->{_args}{autolearn} = 'spamassassin';

    # Pass the loop variable through, so each user is really tested; the
    # expected command must carry that same user after --user.
    # NOTE(review): assumes no 'spamassassin' note is set on the transaction
    # yet, so the plain --process command is expected -- confirm.
    foreach my $user ( qw/ smtpd matt@example.com / ) {
        my $answer = "$dspam --user $user --mode=tum --process --deliver=summary --stdout";
        my $cmd = $self->get_filter_cmd($transaction, $user);
        cmp_ok( $cmd, 'eq', $answer, "$user" );
    };

    $transaction->notes('spamassassin', { is_spam => 'No', autolearn => 'ham' } );
    my $cmd = $self->get_filter_cmd($transaction, 'smtpd');
    cmp_ok( $cmd, 'eq', "$dspam --user smtpd --mode=tum --source=corpus --class=innocent --deliver=summary --stdout",
        "smtpd, ham" );

    $transaction->notes('spamassassin', { is_spam => 'Yes', autolearn => 'spam', score => 110 } );
    $cmd = $self->get_filter_cmd($transaction, 'smtpd');
    cmp_ok( $cmd, 'eq', "$dspam --user smtpd --mode=tum --source=corpus --class=spam --deliver=summary --stdout",
        "smtpd, spam" );

    # is_spam and autolearn disagree: no training, plain --process command
    $transaction->notes('spamassassin', { is_spam => 'No', autolearn => 'spam' } );
    $cmd = $self->get_filter_cmd($transaction, 'smtpd');
    cmp_ok( $cmd, 'eq', "$dspam --user smtpd --mode=tum --process --deliver=summary --stdout",
        "smtpd, SA verdict and autolearn disagree" );
};
sub test_reject_type { sub test_reject_type {
my $self = shift; my $self = shift;