From 52256d2d9b2c3dcf181a89a4acd636e671c029b4 Mon Sep 17 00:00:00 2001
From: Matt Simerson <matt@tnpi.net>
Date: Mon, 25 Jun 2012 02:51:36 -0400
Subject: [PATCH] dspam: fixes for training dspam

dspam_process_backticks now writes the entire message (headers + body) to a temp file and has dspam read that file. Previously, dspam only read the body. With the new "process, then train on error" method, that meant dspam didn't have access to the DSPAM signature (which is in the headers).

Replaced open2 with open3 in dspam_process_open2. Same results: it works part of the time, but not consistently, and I haven't been able to figure out why.

The dspam transaction note is now a hashref (was a string).
The dspam response is now parsed with split and substr (was a regexp).
---
 plugins/dspam | 152 ++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 115 insertions(+), 37 deletions(-)

diff --git a/plugins/dspam b/plugins/dspam
index d80551b..a71ee9b 100644
--- a/plugins/dspam
+++ b/plugins/dspam
@@ -235,10 +235,12 @@ sub data_post_handler {
 
     my $response = $self->dspam_process( $filtercmd, $transaction );
     if ( ! $response->{result} ) {
-        $self->log(LOGWARN, "skip, no dspam response. Check logs for errors.");
+        $self->log(LOGWARN, "error, no dspam response. Check logs for errors.");
         return (DECLINED);
     };
 
+    $transaction->notes('dspam', $response);
+
     $self->attach_headers( $response, $transaction );
     $self->autolearn( $response, $transaction );
 
@@ -264,37 +266,78 @@ sub select_username {
 sub assemble_message {
     my ($self, $transaction) = @_;
 
-    $transaction->body_resetpos;
-
     my $message = "X-Envelope-From: "
         . $transaction->sender->format . "\n"
         . $transaction->header->as_string . "\n\n";
 
+    $transaction->body_resetpos;
     while (my $line = $transaction->body_getline) { $message .= $line; };
 
     $message = join(CRLF, split/\n/, $message);
     return $message . CRLF;
 };
 
+sub parse_response {
+    my $self = shift;
+    my $response = shift or do {
+        $self->log( LOGDEBUG, "missing dspam response!" );
+        return;
+    };
+
+# example DSPAM results:
+# user@example.com; result="Spam"; class="Spam"; probability=1.0000; confidence=1.00; signature=N/A
+# smtpd; result="Innocent"; class="Innocent"; probability=0.0023; confidence=1.00; signature=4f8dae6a446008399211546
+
+    #return $self->parse_response_regexp( $response );  # probably slower
+
+    my ($user, $result, $class, $prob, $conf, $sig) = split '; ', $response;
+
+    (undef, $result) = split '=', $result;
+    (undef, $class ) = split '=', $class;
+    (undef, $prob  ) = split '=', $prob;
+    (undef, $conf  ) = split '=', $conf;
+    (undef, $sig   ) = split '=', $sig;
+
+    $result = substr($result, 1, -1);  # strip off quotes
+    $class  = substr($class,  1, -1);
+
+    return {
+        class       => $class,
+        result      => $result,
+        probability => $prob,
+        confidence  => $conf,
+        signature   => $sig,
+    };
+};
+
+sub parse_response_regexp {
+    my ($self, $response) = @_;
+
+    my ($result, $class, $prob, $conf, $sig) = $response =~ /
+                result=\"(Spam|Innocent)\";\s
+                class=\"(Spam|Innocent)\";\s
+                probability=([\d\.]+);\s
+                confidence=([\d\.]+);\s
+                signature=(.*)
+            /x;
+
+    return {
+        class       => $class,
+        result      => $result,
+        probability => $prob,
+        confidence  => $conf,
+        signature   => $sig,
+    };
+};
+
 sub dspam_process {
     my ( $self, $filtercmd, $transaction ) = @_;
 
-    my $dspam_response = $self->dspam_process_backticks( $filtercmd );
-    #my $dspam_response = $self->dspam_process_open2( $filtercmd, $transaction );
-    #my $dspam_response = $self->dspam_process_fork( $filtercmd );
+    my $response = $self->dspam_process_backticks( $filtercmd );
+    #my $response = $self->dspam_process_open2( $filtercmd, $transaction );
+    #my $response = $self->dspam_process_fork( $filtercmd );
 
-    # X-DSPAM-Result: user@example.com; result="Spam"; class="Spam"; probability=1.0000; confidence=1.00; signature=N/A
-    # X-DSPAM-Result: smtpd; result="Innocent"; class="Innocent"; probability=0.0023; confidence=1.00; signature=4f8dae6a446008399211546
-    my ($r, $p, $c, $s)
-        = $dspam_response
-            =~ /result=\"(Spam|Innocent)\";.*?probability=([\d\.]+); confidence=([\d\.]+); signature=(.*)/;
-
-    return {
-        result      => $r,
-        probability => $p,
-        confidence  => $c,
-        signature   => $s,
-    };
+    return $self->parse_response( $response );
 };
 
 sub dspam_process_fork {
@@ -322,10 +365,22 @@ sub dspam_process_fork {
 sub dspam_process_backticks {
     my ( $self, $filtercmd ) = @_;
 
-    my $filename = $self->qp->transaction->body_filename;
-    my $response = `$filtercmd < $filename`; chomp $response;
-    $self->log(LOGDEBUG, $response);
-    return $response;
+    my $transaction = $self->qp->transaction;
+
+    my $message = $self->temp_file();
+    open my $fh, '>', $message;
+    print $fh "X-Envelope-From: "
+        . $transaction->sender->format . CRLF
+        . $transaction->header->as_string . CRLF . CRLF;
+
+    $transaction->body_resetpos;
+    while (my $line = $transaction->body_getline) { print $fh $line; };
+
+    close $fh;
+
+    my ($line1) = split /[\r|\n]/, `$filtercmd < $message`;
+    $self->log(LOGDEBUG, $line1);
+    return $line1;
 };
 
 sub dspam_process_open2 {
@@ -336,16 +391,25 @@ sub dspam_process_open2 {
 # not sure why, but this is not as reliable as I'd like. What's a dspam
 # error -5 mean anyway?
     use FileHandle;
-    use IPC::Open2;
-    my ($dspam_in, $dspam_out);
-    my $pid = open2($dspam_out, $dspam_in, $filtercmd);
-    print $dspam_in $message;
-    close $dspam_in;
+    use IPC::Open3;
+    my ($read, $write, $err);
+    use Symbol 'gensym'; $err = gensym;
+    my $pid = open3($write, $read, $err, $filtercmd);
+    print $write $message;
+    close $write;
     #my $response = join('', <$dspam_out>);  # get full response
-    my $response = <$dspam_out>;             # get first line only
+    my $response = <$read>;             # get first line only
     waitpid $pid, 0;
-    chomp $response;
-    $self->log(LOGDEBUG, $response);
+    my $child_exit_status = $? >> 8;
+    #$self->log(LOGINFO, "exit status: $child_exit_status");
+    if ( $response ) {
+        chomp $response;
+        $self->log(LOGDEBUG, $response);
+    };
+    my $err_msg = <$err>;
+    if ( $err_msg ) {
+        $self->log(LOGDEBUG, $err_msg );
+    };
     return $response;
 };
 
@@ -367,7 +431,7 @@ sub log_and_return {
     };
 
     if ( $reject eq 'agree' ) {
-        return $self->reject_agree( $transaction, $d );
+        return $self->reject_agree( $transaction );
     };
 
     if ( $d->{class} eq 'Innocent' ) {
@@ -394,9 +458,10 @@ sub log_and_return {
 }
 
 sub reject_agree {
-    my ($self, $transaction, $d ) = @_;
+    my ($self, $transaction ) = @_;
 
     my $sa = $transaction->notes('spamassassin' );
+    my $d  = $transaction->notes('dspam' );
 
     my $status = "$d->{class}, $d->{confidence} c";
 
@@ -423,13 +488,14 @@ sub reject_agree {
         if ( $sa->{is_spam} eq 'No' ) {
             if ( $d->{confidence} > .9 ) {
                 if ( defined $self->connection->notes('karma') ) {
-                    $self->connection->notes('karma', $self->connection->notes('karma') + 2);
+                    $self->connection->notes('karma', ( $self->connection->notes('karma') + 2) );
                 };
             };
             $self->log(LOGINFO, "pass, agree, $status");
             return DECLINED;
         };
         $self->log(LOGINFO, "pass, disagree, $status");
+        return DECLINED;
     };
 
     $self->log(LOGINFO, "pass, other $status");
@@ -489,7 +555,13 @@ sub train_error_as_ham {
     my $user = $self->select_username( $transaction );
     my $dspam_bin = $self->{_args}{dspam_bin} || '/usr/local/bin/dspam';
     my $cmd = "$dspam_bin --user $user --mode=toe --source=error --class=innocent --deliver=summary --stdout";
-    $self->dspam_process( $cmd, $transaction );
+    my $response = $self->dspam_process( $cmd, $transaction );
+    if ( $response ) {
+        $transaction->notes('dspam', $response);
+    }
+    else {
+        $transaction->notes('dspam', { class => 'Innocent', result => 'Innocent', confidence=>1 } );
+    };
 };
 
 sub train_error_as_spam {
@@ -499,7 +571,13 @@ sub train_error_as_spam {
     my $user = $self->select_username( $transaction );
     my $dspam_bin = $self->{_args}{dspam_bin} || '/usr/local/bin/dspam';
     my $cmd = "$dspam_bin --user $user --mode=toe --source=error --class=spam --deliver=summary --stdout";
-    $self->dspam_process( $cmd, $transaction );
+    my $response = $self->dspam_process( $cmd, $transaction );
+    if ( $response ) {
+        $transaction->notes('dspam', $response);
+    }
+    else {
+        $transaction->notes('dspam', { class => 'Spam', result => 'Spam', confidence=>1 } );
+    };
 };
 
 sub autolearn {
@@ -572,12 +650,12 @@ sub autolearn_spamassassin {
     };
 
     if ( $sa->{is_spam} eq 'Yes' && $sa->{autolearn} eq 'spam' && $response->{result} eq 'Innocent' ) {
-        $self->log(LOGINFO, "training spamassassin FN as spam");
+        $self->log(LOGINFO, "training SA FN as spam");
         $self->train_error_as_spam( $transaction );
         return 1;
     }
     elsif ( $sa->{is_spam} eq 'No' && $sa->{autolearn} eq 'ham' && $response->{result} eq 'Spam' ) {
-        $self->log(LOGINFO, "training spamassassin FP as ham");
+        $self->log(LOGINFO, "training SA FP as ham");
         $self->train_error_as_ham( $transaction );
         return 1;
     };