From 102e0682978338aefeb02343f75ed83104f17d91 Mon Sep 17 00:00:00 2001
From: Matt Simerson <matt@tnpi.net>
Date: Fri, 20 Apr 2012 01:42:04 -0400
Subject: [PATCH] added dspam plugin

---
 config.sample/plugins |   2 +
 plugins/dspam         | 341 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 343 insertions(+)
 create mode 100644 plugins/dspam
diff --git a/config.sample/plugins b/config.sample/plugins
index 3ac4af1..451d749 100644
--- a/config.sample/plugins
+++ b/config.sample/plugins
@@ -57,6 +57,8 @@ spamassassin
 #
 #   spamassassin reject_threshold 20 munge_subject_threshold 10
 
+# dspam must run after spamassassin for the learn_from_sa feature to work
+dspam learn_from_sa 7 reject 1
 
 # run the clamav virus checking plugin
 # virus/clamav
diff --git a/plugins/dspam b/plugins/dspam
new file mode 100644
index 0000000..86f59f0
--- /dev/null
+++ b/plugins/dspam
@@ -0,0 +1,341 @@
+#!perl -Tw
+
+=head1 NAME
+
+dspam - dspam integration for qpsmtpd
+
+=head1 DESCRIPTION
+
+qpsmtpd plugin that uses dspam to classify messages. Can use SpamAssassin to 
+train dspam.
+
+Adds the X-DSPAM-Result and X-DSPAM-Signature headers to messages. The latter is essential for 
+training dspam and the former is useful to MDAs, MUAs, and humans.
+
+=head1 TRAINING DSPAM
+
+To get dspam into a useful state, it must be trained. The best method way to
+train dspam is to feed it two large equal sized corpuses of spam and ham from
+your mail server. The dspam authors suggest avoiding public corpuses. I do
+this as follows:
+
+=over 4
+
+=item learn from SpamAssassin
+
+See the docs on the learn_from_sa feature in the CONFIG section.
+
+=item daily training
+
+I have a script that crawls the contents of every users maildir each night.
+The script builds two lists of messages: ham and spam. 
+
+The spam message list consists of all read messages in folders named Spam
+that have changed since the last spam learning run (normally 1 day).
+
+The ham message list consists of read messages in any folder not named like
+Spam, Junk, Trash, or Deleted. This catches messages that users have read 
+and left in their inbox, filed away into subfolders, and 
+
+=item on-the-fly training
+
+=back
+
+
+
+=head1 CONFIG
+
+=over 4
+
+=item dspam_bin
+
+The path to the dspam binary. If yours is installed somewhere other 
+than /usr/local/bin/dspam, you'll need to set this.
+
+=item learn_from_sa
+
+Dspam can be trained by SpamAssassin. This relationship between them requires
+attention to several important details:
+
+=over 4
+
+=item 1
+
+dspam must be listed B<after> spamassassin in the config/plugins file. 
+Because SA runs first, I crank the SA reject_threshold up above 100 so that
+all spam messages will be used to train dspam.
+
+Once dspam is trained and errors are rare, I plan to run dspam first and
+reduce the SA load.
+
+=item 2
+
+Autolearn must be enabled and configured in SpamAssassin. SA autolearn
+preferences will determine whether a message is learned as spam or innocent
+by dspam. The settings to pay careful attention to in your SA local.cf file 
+are bayes_auto_learn_threshold_spam and bayes_auto_learn_threshold_nonspam.
+Make sure they are both set to conservative values that are certain to 
+yield no false positives.
+
+If you are using learn_from_sa and reject, then messages that exceed the SA
+threshholds will cause dspam to reject them. Again I say, make sure them SA
+autolearn threshholds are set high enough to avoid false positives.
+
+=item 3
+
+dspam must be configured and working properly. I have modified the following
+dspam values on my system: 
+
+=over 4
+
+=item mysql storage
+
+=item Trust smtpd
+
+=item TrainingMode tum
+
+=item Tokenizer osb
+
+=item Preference "trainingMode=TOE"
+
+=item Preference "spamAction=deliver"
+
+=item Preference "signatureLocation=headers"
+
+=item TrainPristine off
+
+=item ParseToHeaders off
+
+=back
+
+Of those changes, the most important is the signature location. This plugin
+only supports storing the signature in the headers. If you want to train dspam
+after delivery (ie, users moving messages to/from spam folders), then the
+dspam signature must be in the headers.
+
+=back
+
+=item reject
+
+Set to a floating point value between 0 and 1.00 where 0 is no confidence
+and 1.0 is 100% confidence.
+
+If dspam's confidence is greater than or equal to this threshold, the 
+message will be rejected.
+
+=back
+
+
+=head1 MULTIPLE RECIPIENT BEHAVIOR
+
+For messages with multiple recipients, the user that dspam is running as will
+be the dspam username.
+
+When messages have a single recipient, the recipient address is used as the
+dspam username. For dspam to trust qpsmtpd with modifying the username, you
+B<must> add the username that qpsmtpd is running to to the dspamd.conf file.
+
+ie, (Trust smtpd).
+
+=head1 CHANGES
+
+=cut
+
+use strict;
+
+use Qpsmtpd::Constants;
+use Qpsmtpd::DSN;
+use IO::Handle;
+use Socket qw(:DEFAULT :crlf);
+
+sub register {
+    my ($self, $qp, @args) = @_;
+
+    $self->log(LOGERROR, "Bad parameters for the dspam plugin") if @_ % 2;
+
+    %{$self->{_args}} = @args;
+
+    $self->register_hook('data_post', 'dspam_reject')
+        if $self->{_args}->{reject};
+}
+
+sub hook_data_post {
+    my ($self, $transaction) = @_;
+
+    $self->log(LOGDEBUG, "check_dspam");
+    return (DECLINED) if $transaction->data_size > 500_000;
+
+    my $username = $self->select_username( $transaction );
+    my $message  = $self->assemble_message($transaction);
+    my $filtercmd = $self->get_filter_cmd( $transaction, $username );
+    $self->log(LOGWARN, $filtercmd);
+
+    my $response = $self->dspam_process( $filtercmd, $message );
+    if ( ! $response ) {
+        $self->log(LOGWARN, "No response received from dspam. Check your logs for errors.");
+        return (DECLINED);
+    };
+    $self->log(LOGWARN, $response);
+
+    # X-DSPAM-Result: user@example.com; result="Spam"; class="Spam"; probability=1.0000; confidence=1.00; signature=N/A
+    # X-DSPAM-Result: smtpd; result="Innocent"; class="Innocent"; probability=0.0023; confidence=1.00; signature=4f8dae6a446008399211546
+    my ($result,$prob,$conf,$sig) = $response =~ /result=\"(Spam|Innocent)\";.*?probability=([\d\.]+); confidence=([\d\.]+); signature=(.*)/;
+    my $header_str = "$result, probability=$prob, confidence=$conf";
+    $self->log(LOGWARN, $header_str);
+    $transaction->header->add('X-DSPAM-Result', $header_str, 0);
+
+    # the signature header is required if you intend to train dspam later
+    # you must set Preference "signatureLocation=headers" in dspam.conf
+    $transaction->header->add('X-DSPAM-Signature', $sig, 0);
+
+    return (DECLINED);
+};
+
+sub select_username {
+    my ($self, $transaction) = @_;
+
+    my $recipient_count = scalar $transaction->recipients;
+    $self->log(LOGDEBUG, "Message has $recipient_count recipients");
+
+    if ( $recipient_count > 1 ) {
+        $self->log(LOGINFO, "skipping user prefs, $recipient_count recipients detected.");
+        return getpwuid($>);
+    };
+
+# use the recipients email address as username. This enables user prefs
+    my $username = ($transaction->recipients)[0]->address;
+    return lc($username);
+};
+
+sub assemble_message {
+    my ($self, $transaction) = @_;
+
+    $transaction->body_resetpos;
+
+    my $message = "X-Envelope-From: "
+        . $transaction->sender->format . "\n"
+        . $transaction->header->as_string . "\n\n";
+
+    while (my $line = $transaction->body_getline) { $message .= $line; };
+
+    $message = join(CRLF, split/\n/, $message);
+    return $message . CRLF;
+};
+
+sub dspam_process {
+    my ( $self, $filtercmd, $message ) = @_;
+
+    #return $self->dspam_process_open2( $filtercmd, $message );
+
+    my ($in_fh, $out_fh);
+    if (! open($in_fh, "-|")) {
+        open($out_fh, "|$filtercmd") or die "Can't run $filtercmd: $!\n";
+        print $out_fh $message;
+        close $out_fh;
+        exit(0);
+    };
+    my $response = join('', <$in_fh>);
+    close $in_fh;
+    chomp $response;
+
+    return $response;
+};
+
+sub dspam_process_open2 {
+    my ( $self, $filtercmd, $message ) = @_;
+
+# not sure why, but this is not as reliable as I'd like. What's a dspam 
+# error -5 mean anyway?
+    use FileHandle;
+    use IPC::Open2;
+    my ($dspam_in, $dspam_out);
+    my $pid = open2($dspam_out, $dspam_in, $filtercmd);
+    print $dspam_in $message;
+    close $dspam_in;
+    my $response = join('', <$dspam_out>);
+    waitpid $pid, 0;
+    chomp $response;
+    return $response;
+};
+
+sub dspam_reject {
+    my ($self, $transaction) = @_;
+
+    return (DECLINED) if ! $self->{_args}->{reject};
+
+    my $status = $transaction->header->get('X-DSPAM-Result') or do {
+        $self->log(LOGWARN, "dspam_reject: failed to find the dspam header");
+        return (DECLINED);
+    };
+    my ($clas,$probability,$confidence)  = $status =~ m/^(Spam|Innocent), probability=([\d\.]+), confidence=([\d\.]+)/i;
+
+    $self->log(LOGDEBUG, "dspam $clas, prob: $probability, conf: $confidence");
+
+    if ( $clas eq 'Spam' && $probability == 1 && $confidence == 1 ) {
+# default of media_unsupported is DENY, so just change the message
+        if ( $self->qp->connection->relay_client ) {
+            $self->log(LOGWARN, "allowing spam since user authenticated");
+            return DECLINED;
+        };
+        return Qpsmtpd::DSN->media_unsupported('dspam says, no spam please');
+    };
+
+    return DECLINED;
+}
+
+sub get_filter_cmd {
+    my ($self, $transaction, $user) = @_;
+
+    my $dspam_bin = $self->{_args}->{dspam_bin} || '/usr/local/bin/dspam';
+    my $default = "$dspam_bin --user $user --mode=tum --process --deliver=summary --stdout";
+    my $min_score = $self->{_args}->{learn_from_sa} or return $default;
+
+    #$self->log(LOGDEBUG, "attempting to learn from SA");
+    my $sa_status = $transaction->header->get('X-Spam-Status');
+
+    if ( ! $sa_status ) {
+        $self->log(LOGERROR, "dspam learn_from_sa was set but no X-Spam-Status header detected");
+        return $default;
+    };
+    chomp $sa_status;
+
+    my ($is_spam,$score,$autolearn) = $sa_status =~ /^(yes|no), score=([\d\.\-]+)\s.*?autolearn=([\w]+)/i;
+    $self->log(LOGINFO, "sa_status: $sa_status; $is_spam; $autolearn");
+
+    $is_spam = lc($is_spam); 
+    $autolearn = lc($autolearn);
+
+    if ( $is_spam eq 'yes' && $score < $min_score ) {
+        $self->log(LOGWARN, "SA spam score of $score is less than $min_score, skipping autolearn");
+        return $default;
+    };
+
+    if ( $is_spam eq 'yes' && $autolearn eq 'spam' ) {
+        return "$dspam_bin --user $user --mode=tum --source=corpus --class=spam --deliver=summary --stdout";
+    }
+    elsif ( $is_spam eq 'no' && $autolearn eq 'ham' ) {
+        return "$dspam_bin --user $user --mode=tum --source=corpus --class=innocent --deliver=summary --stdout";
+    };
+
+    return $default;
+};
+
+sub _cleanup_spam_header {
+    my ($self, $transaction, $header_name) = @_;
+
+    my $action = 'rename';
+    if ( $self->{_args}->{leave_old_headers} ) {
+        $action = lc($self->{_args}->{leave_old_headers});
+    };
+
+    return unless $action eq 'drop' || $action eq 'rename';
+
+    my $old_header_name = $header_name;
+    $old_header_name = ($old_header_name =~ s/^X-//) ? "X-Old-$old_header_name" : "Old-$old_header_name";
+
+    for my $header ( $transaction->header->get($header_name) ) {
+        $transaction->header->add($old_header_name, $header) if $action eq 'rename';
+        $transaction->header->delete($header_name);
+    }
+}
+