spamassassin-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From j.@apache.org
Subject svn commit: rev 10336 - incubator/spamassassin/trunk/lib/Mail/SpamAssassin
Date Wed, 28 Apr 2004 02:06:24 GMT
Author: jm
Date: Tue Apr 27 19:06:23 2004
New Revision: 10336

Modified:
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
   incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
Log:
bug 2266: add 'add_header' tags to display information about the message's Bayesian classification
and the tokens used; patch contributed by David Koppelman

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Bayes.pm	Tue Apr 27 19:06:23 2004
@@ -1059,6 +1059,45 @@
   return $prob;
 }
 
+###########################################################################
+# Declassification distance of a single token.  As called from
+# bayes_report_make_list (PerMsgStatus.pm): $Ns/$Nn are the bayes_nspam/
+# bayes_nham counts, $ns/$nn the token's spam/ham counts, $prob its
+# probability.
+# If a token is neither hammy nor spammy, return 0.
+# For a spammy token, return the number of additional ham messages it
+# would have had to appear in to no longer be spammy: a threshold count is
+# computed, rounded up, and the token's current ham count subtracted.
+# Hammy tokens are handled the same way with spam and ham swapped.
+
+sub compute_declassification_distance {
+  my ($self, $Ns, $Nn, $ns, $nn, $prob) = @_;
+
+  return 0 if $ns == 0 && $nn == 0;
+  # Low-count cutoffs: a token seen too few times gets a distance of 0.
+  if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {return 0 if ($ns + $nn < 10);}
+  if (!$self->{use_hapaxes}) {return 0 if ($ns + $nn < 2);}
+
+  return 0 if $Ns == 0 || $Nn == 0;
+  return 0 if abs( $prob - 0.5 ) < $self->{robinson_min_prob_strength};
+  # "b" is the class the token leans towards, "a" is the opposite class.
+  my ($Na,$na,$Nb,$nb) = $prob > 0.5 ? ($Nn,$nn,$Ns,$ns) : ($Ns,$ns,$Nn,$nn);
+  my $p = 0.5 - $self->{robinson_min_prob_strength};
+  # int( 1.0 - 1e-6 + x ) rounds a positive x up (to within 1e-6).
+  return int( 1.0 - 1e-6 + $nb * $Na * $p / ($Nb * ( 1 - $p )) ) - $na
+    unless USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS;
+  # Otherwise use the "+" root of $a*x**2 + $b*x + $c = 0 (quadratic formula).
+  my $s = $self->{robinson_s_constant};
+  my $sx = $self->{robinson_s_times_x};
+  my $a = $Nb * ( 1 - $p );
+  my $b = $Nb * ( $sx + $nb * ( 1 - $p ) - $p * $s ) - $p * $Na * $nb;
+  my $c = $Na * $nb * ( $sx - $p * ( $s + $nb ) );
+  # NOTE(review): sqrt() is fatal for a negative argument -- confirm b*b >= 4ac.
+  return int( 1.0 - 1e-6 + ( -$b + sqrt( $b * $b - 4 * $a * $c ) ) / ( 2 * $a ) )
+    - $na;
+}
+
+
 # Check to make sure we can tie() the DB, and we have enough entries to do a scan
 sub is_scan_available {
   my $self = shift;
@@ -1105,15 +1144,18 @@
 
   my $msgdata = $self->get_msgdata_from_permsgstatus ($permsgstatus);
 
+  my $pw;
+  my @tokens = $self->tokenize ($msg, $msgdata);
+
   # Figure out our probabilities for the message tokens
   my %pw = map {
-      my $pw = $self->compute_prob_for_token ($_, $ns, $nn);
+      $pw = $self->compute_prob_for_token ($_, $ns, $nn);
       if (!defined $pw) {
 	();		# exit map()
       } else {
 	($_ => $pw);
       }
-  } $self->tokenize ($msg, $msgdata);
+  } @tokens;
 
   # If none of the tokens were found in the DB, we're going to skip
   # this message...
@@ -1122,6 +1164,9 @@
     goto skip;
   }
 
+  my $tcount_total = @tokens;
+  my $tcount_learned = keys %pw;
+
   # Figure out the message receive time (used as atime below)
   # If the message atime comes back as being in the future, something's
   # messed up and we should revert to current time as a safety measure.
@@ -1134,6 +1179,10 @@
   my $count = N_SIGNIFICANT_TOKENS;
   my @sorted = ();
 
+  my ($tcount_spammy,$tcount_hammy) = (0,0);
+  my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = [];
+  my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = [];
+
   for (sort {
               abs($pw{$b} - 0.5) <=> abs($pw{$a} - 0.5)
             } keys %pw)
@@ -1141,6 +1190,15 @@
     if ($count-- < 0) { last; }
     my $pw = $pw{$_};
     next if (abs($pw - 0.5) < $self->{robinson_min_prob_strength});
+
+    # What's more expensive, scanning headers for HAMMYTOKENS and
+    # SPAMMYTOKENS tags that aren't there or collecting data that
+    # won't be used?  Just collecting the data is certainly simpler.
+    #
+    my ($s, $n, $a) = $self->{store}->tok_get ($_);
+    push @$tinfo_spammy, [$_,$pw,$s,$n,$a] if $pw >= 0.5 && ++$tcount_spammy;
+    push @$tinfo_hammy,  [$_,$pw,$s,$n,$a] if $pw <  0.5 && ++$tcount_hammy;
+
     push (@sorted, $pw);
 
     # update the atime on this token, it proved useful
@@ -1167,6 +1225,9 @@
 
   dbg ("bayes: score = $score");
 
+  $permsgstatus->{bayes_nspam} = $ns;
+  $permsgstatus->{bayes_nham} = $nn;
+
   if ($self->{log_raw_counts}) {
     print "#Bayes-Raw-Counts: $self->{raw_counts}\n";
   }
@@ -1179,6 +1240,11 @@
   $self->{store}->cleanup();
   $self->opportunistic_calls();
   $self->{store}->untie_db();
+
+  $permsgstatus->{tag_data}{BAYESTCHAMMY} = $tcount_hammy;
+  $permsgstatus->{tag_data}{BAYESTCSPAMMY} = $tcount_spammy;
+  $permsgstatus->{tag_data}{BAYESTCLEARNED} = $tcount_learned;
+  $permsgstatus->{tag_data}{BAYESTC} = $tcount_total;
 
   return $score;
 }

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/Conf.pm	Tue Apr 27 19:06:23 2004
@@ -70,6 +70,13 @@
  _SUBVERSION_      sub-version/code revision date (eg. 2004-01-10)
  _HOSTNAME_        hostname
  _BAYES_           bayes score
+ _TOKENSUMMARY_    number of new, neutral, spammy, and hammy tokens found
+ _BAYESTC_         total number of tokens found
+ _BAYESTCLEARNED_  number of seen tokens found
+ _BAYESTCSPAMMY_   number of spammy tokens found
+ _BAYESTCHAMMY_    number of hammy tokens found
+ _HAMMYTOKENS(N)_  the N most significant hammy tokens (default: 5)
+ _SPAMMYTOKENS(N)_ the N most significant spammy tokens (default: 5)
  _AWL_             AWL modifier
  _DATE_            rfc-2822 date of scan
  _STARS(*)_        one * (use any character) for each score point (note: this
@@ -89,6 +96,70 @@
  _REPORT_          terse report of tests hit (for header reports)
  _SUMMARY_         summary of tests hit for standard report (for body reports)
  _CONTACTADDRESS_  contents of the 'report_contact' setting
+
+The C<HAMMYTOKENS> and C<SPAMMYTOKENS> tags have an optional second argument
+which specifies a format: C<_SPAMMYTOKENS(N,FMT)_>, C<_HAMMYTOKENS(N,FMT)_>.
+The following formats are available:
+
+=over 4
+
+=item short
+
+Only the tokens themselves are listed.
+I<For example, preference file entry:>
+
+C<add_header all Spammy _SPAMMYTOKENS(2,short)_>
+
+I<Results in message header:>
+
+C<X-Spam-Spammy: remove.php, UD:jpg>
+
+Indicating that the top two spammy tokens found are C<remove.php>
+and C<UD:jpg>.  (The token itself follows the last colon; the
+text before the colon indicates something about the token.
+C<UD> means the token looks like it might be part of a domain name.)
+
+=item compact
+
+The token probability, an abbreviated declassification distance (see
+example), and the token are listed.
+I<For example, preference file entry:>
+
+C<add_header all Spammy _SPAMMYTOKENS(2,compact)_>
+
+I<Results in message header:>
+
+C<X-Spam-Spammy: 0.989-6--remove.php, 0.988-+--UD:jpg>
+
+Indicating that the probabilities of the top two tokens are 0.989 and
+0.988, respectively.  The first token has a declassification distance
+of 6, meaning that if the token had appeared in at least 6 more ham
+messages it would not be considered spammy.  The C<+> for the second
+token indicates a declassification distance greater than 9.
+
+=item long
+
+Probability, declassification distance, number of times seen in a ham
+message, number of times seen in a spam message, age and the token are
+listed.
+
+I<For example, preference file entry:>
+
+C<add_header all Spammy _SPAMMYTOKENS(2,long)_>
+
+I<Results in message header:>
+
+C<X-Spam-Spammy: 0.989-6--0h-4s--4d--remove.php, 0.988-33--2h-25s--1d--UD:jpg>
+
+In addition to the information provided by the compact option,
+the long option shows that the first token appeared in zero
+ham messages and four spam messages, and that it was last
+seen four days ago.  The second token appeared in two ham messages,
+25 spam messages and was last seen one day ago.
+(Unlike the C<compact> option, the long option shows declassification
+distances that are greater than 9.)
+
+=back
 
 =head1 USER PREFERENCES
 

Modified: incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm
==============================================================================
--- incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm	(original)
+++ incubator/spamassassin/trunk/lib/Mail/SpamAssassin/PerMsgStatus.pm	Tue Apr 27 19:06:23 2004
@@ -869,6 +869,54 @@
   return $text;
 }
 
+# Render the HAMMYTOKENS/SPAMMYTOKENS tag value.  $info: array ref of
+# [token, prob, spam count, ham count, last-seen time] entries, or undef;
+# $param: the tag's "N,FMT" argument (default "5").  Returns the first N
+# entries formatted per FMT and joined by ", ", or an explanatory message.
+sub bayes_report_make_list {
+  my ($self, $info, $param) = @_;
+  my ($limit, $fmt_arg) = split /,/, $param || "5";
+  return "Tokens not available." unless defined $info;
+
+  my %formats =
+    ( short => '$t',
+      Short => 'Token: \"$t\"',
+      compact => '$p-$D--$t',
+      Compact => 'Probability $p -declassification distance $D (\"+\" means > 9) --token: \"$t\"',
+      medium => '$p-$D-$N--$t',
+      long => '$p-$d--${h}h-${s}s--${a}d--$t',
+      Long => 'Probability $p -declassification distance $D --in ${h} ham messages -and ${s} spam messages --${a} days old--token:\"$t\"'
+    );
+
+  my $allow_user_defined = 0;   # keep 0: templates reach the string eval
+  my $raw_fmt =   !$fmt_arg ? '$p-$D--$t'
+                : $allow_user_defined && $fmt_arg =~ m/^\"([^"]+)\"/ ? $1
+                : $formats{$fmt_arg};
+
+  return "Invalid format, must be one of: ".join(",", sort keys %formats)
+    unless defined $raw_fmt;
+
+  my $fmt = '"'.$raw_fmt.'"';
+  my $amt = $limit < @$info ? $limit : @$info;
+  return "" unless $amt;
+
+  my $Bayes = $self->{main}{bayes_scanner};
+  my ($ns, $nh) = @{$self}{qw(bayes_nspam bayes_nham)};
+  my $digit = sub { $_[0] > 9 ? "+" : $_[0] };
+  my $now = time;
+
+  join ', ', map {
+    my($t,$prob,$s,$h,$u) = @$_;
+    my $a = int(($now - $u)/(3600 * 24));
+    my $d = $Bayes->compute_declassification_distance($ns,$nh,$s,$h,$prob);
+    my $p = sprintf "%.3f", $prob;
+    my $n = $s + $h;
+    my ($c,$o) = $prob < 0.5 ? ($h,$s) : ($s,$h);
+    my ($D,$S,$H,$C,$O,$N) = map $digit->($_), ($d,$s,$h,$c,$o,$n);
+    eval $fmt;
+  } @{$info}[0..$amt-1];
+}
+
 sub _get_tag_value_for_yesno {
   my $self   = shift;
   
@@ -922,6 +970,33 @@
             BAYES => sub {
               defined($self->{bayes_score}) ?
                         sprintf("%3.4f", $self->{bayes_score}) : "0.5"
+            },
+
+            HAMMYTOKENS => sub {
+              $self->bayes_report_make_list
+                ( $self->{bayes_token_info_hammy}, shift );
+            },
+
+            SPAMMYTOKENS => sub {
+              $self->bayes_report_make_list
+                ( $self->{bayes_token_info_spammy}, shift );
+            },
+
+            TOKENSUMMARY => sub {
+              if( defined $self->{tag_data}{BAYESTC} )
+                {
+                  my $tcount_neutral = $self->{tag_data}{BAYESTCLEARNED}
+                    - $self->{tag_data}{BAYESTCSPAMMY}
+                    - $self->{tag_data}{BAYESTCHAMMY};
+                  my $tcount_new = $self->{tag_data}{BAYESTC}
+                    - $self->{tag_data}{BAYESTCLEARNED};
+                  "Tokens: new, $tcount_new; "
+                    ."hammy, $self->{tag_data}{BAYESTCHAMMY}; "
+                    ."neutral, $tcount_neutral; "
+                    ."spammy, $self->{tag_data}{BAYESTCSPAMMY}."
+                } else {
+                  "Bayes not run.";
+                }
             },
 
             DATE => \&Mail::SpamAssassin::Util::time_to_rfc822_date,

Mime
View raw message