spamassassin-commits mailing list archives

From: j.@apache.org
Subject: svn commit: r570384 - /spamassassin/trunk/masses/rule-qa/mk-ruleqa-reports
Date: Tue, 28 Aug 2007 11:30:47 GMT
Author: jm
Date: Tue Aug 28 04:30:47 2007
New Revision: 570384

URL: http://svn.apache.org/viewvc?rev=570384&view=rev
Log:
replacement for corpus-hourly, redesigned to be more suitable for the current ruleqa system

Added:
    spamassassin/trunk/masses/rule-qa/mk-ruleqa-reports   (with props)

Added: spamassassin/trunk/masses/rule-qa/mk-ruleqa-reports
URL: http://svn.apache.org/viewvc/spamassassin/trunk/masses/rule-qa/mk-ruleqa-reports?rev=570384&view=auto
==============================================================================
--- spamassassin/trunk/masses/rule-qa/mk-ruleqa-reports (added)
+++ spamassassin/trunk/masses/rule-qa/mk-ruleqa-reports Tue Aug 28 04:30:47 2007
@@ -0,0 +1,627 @@
+#!/local/perl586/bin/perl -w
+
+# settings are located in $HOME/.corpus
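+#
+# an illustrative ~/.corpus might contain key = value pairs like the
+# following (parsed by configure() below; these paths and values are
+# examples only, not the production settings):
+#
+#   corpus = /home/automc/corpus
+#   html = /home/automc/corpus/html
+#   tree = /home/automc/svn/spamassassin
+#   tmp = /home/automc/tmp
+#   html_mode = 0755
+#   rules_dir = $PWD/../rules
+#   ruleqa_url = http://ruleqa.spamassassin.org/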
+
+use strict;
+use Getopt::Long;
+use File::Path;
+use File::Copy;
+use Time::ParseDate;
+use Cwd;
+use POSIX qw(nice strftime);
+
+sub usage {
+  die "
+usage: mk-ruleqa-reports [--tag=(n|b)] [--dir=/path/to/files] [--copylogs]
+  [--reports='...'] [--datadir=/path/to/data] [--daterev=daterev]
+
+--copylogs copies log files into the data (log, report) storage area.
+At least one of --copylogs or --reports is required; both may be given together.
+
+reports can be one or more of: 
+  DETAILS.new DETAILS.age DETAILS.all NET.new NET.all NET.age SCOREMAP.new
+
+see http://wiki.apache.org/spamassassin/DateRev for info on daterevs.
+
+";
+}
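+
+# example invocations (illustrative; report names and daterevs will vary):
+#
+#   mk-ruleqa-reports --copylogs
+#   mk-ruleqa-reports --reports='DETAILS.new DETAILS.all NET.new'
+#   mk-ruleqa-reports --copylogs --reports='SCOREMAP.new' --daterev=20070609-r545837-b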
+
+use vars qw(
+    $opt_dir $opt_reports $opt_tag $opt_copylogs $opt_datadir $opt_daterev
+);
+GetOptions(
+    "tag=s" => \$opt_tag,
+    "dir=s" => \$opt_dir,
+    "daterev=s" => \$opt_daterev,
+    "reports=s" => \$opt_reports,
+    "datadir=s" => \$opt_datadir,
+    'copylogs' => \$opt_copylogs
+);
+
+$opt_tag ||= 'n';       # nightly is the default
+usage() unless ($opt_reports or $opt_copylogs);
+
+nice(15);
+
+# what's the max age of mail we will accept data from? (in weeks)
+# TODO: maybe this should be in ~/.corpus
+my $OLDEST_HAM_WEEKS    = 52 * 5;       # 5 years
+my $OLDEST_SPAM_WEEKS    = 6 * 4;       # 6 months
+
+# daterevs -- e.g. "20060429/r239832-r" -- are aligned to 0800 UTC, just before
+# the time of day when the mass-check tagging occurs; see
+# http://wiki.apache.org/spamassassin/DateRev for more details.
+use constant DATEREV_ADJ => - (8 * 60 * 60);
+use constant WEEK => (7*60*60*24);
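+
+# for example (illustrative): with mk_daterev() below adding DATEREV_ADJ
+# (minus eight hours), a log timestamped 2006-04-29 07:30 UTC falls under the
+# "20060428" date, while one timestamped 09:30 UTC the same day falls under
+# "20060429" -- the day boundary is 0800 UTC rather than midnight.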
+
+# ---------------------------------------------------------------------------
+
+my %conf = ();
+$ENV{TZ} = 'UTC';
+configure("$ENV{HOME}/.corpus");
+
+$opt_dir ||= $conf{corpus};
+$opt_datadir ||= $conf{html};
+
+# logfile metadata
+my %time = ();
+my %dateline = ();
+my %revision = ();
+my %filesize = ();
+my %mtime = ();
+my $time_start = time;
+my $output_revpath;
+
+my %logs_by_daterev = ();
+my %is_net_daterev = ();
+
+my @tmps = ();
+$SIG{INT} = sub { clean_up(); die "SIGINT"; };
+$SIG{TERM} = sub { clean_up(); die "SIGTERM"; };
+$ENV{TIME} = '%e,%U,%S';
+
+$opt_copylogs and copylogs();
+mk_reports();
+clean_up();
+exit;
+
+# ---------------------------------------------------------------------------
+
+sub configure {
+  my $configuration = shift;
+  # does rough equivalent of source
+  open(C, $configuration) || die "open failed: $configuration: $!\n";
+  my $pwd = Cwd::getcwd;
+
+  while (<C>) {
+    chomp;
+    s/#.*//;
+    if (/^\s*(.*?)\s*=\s*(.*?)\s*$/) {
+      my ($key, $val) = ($1, $2);
+      $val =~ s/\$PWD/$pwd/gs;
+      $conf{$key} = $val;
+    }
+  }
+  close(C);
+  $conf{tmp} ||= "/tmp";
+}
+
+# ---------------------------------------------------------------------------
+
+sub copylogs {
+  print "copying logs from: $opt_dir\n";
+
+  opendir(LOGDIR, $opt_dir);
+  my @files = readdir(LOGDIR);
+  closedir(LOGDIR);
+
+  @files = grep {
+    /^(?:spam|ham)-(?:net-)?[-\w]+\.log$/i && -f "$opt_dir/$_"
+                # && -M _ < 10
+  } @files;
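+
+  # illustrative filenames accepted by the pattern above:
+  #   ham-mc-fast.log, spam-mc-fast.log, ham-net-mc-fast.log, spam-net-mc-fast.log
+  # (the "-net-" variants come from --net mass-checks; see parse_file_metadata)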
+
+  foreach my $file (sort @files) {
+    parse_file_metadata($file);
+  }
+
+  foreach my $daterev (sort keys %logs_by_daterev) {
+    print STDERR "\ncopying to: $opt_datadir/$daterev/LOGS.all\n";
+
+    my @ham = grep { /^ham/ } @{$logs_by_daterev{$daterev}};
+    my @spam = grep { /^spam/ } @{$logs_by_daterev{$daterev}};
+    print STDERR "input h: " . join(' ', @ham) . "\n";
+    print STDERR "input s: " . join(' ', @spam) . "\n";
+
+    my $dir = create_outputdir($daterev);
+    my $bytes = 0;
+
+    foreach my $f (@ham, @spam) {
+      $f =~ s/[^-\._A-Za-z0-9]+/_/gs;    # sanitize!
+      my $srcf = "$opt_dir/$f";
+      my $zf = "$dir/LOGS.all-$f.gz";
+
+      my $targetfreshness = (-M $zf);
+      my $srcfreshness = (-M $srcf);
+      next if ($targetfreshness > $srcfreshness && !-z $zf);
+
+      my $when = scalar localtime time;
+      print "creating: $zf\nstarted $when...\n";
+
+      system("gzip -c < '$srcf' > '$zf.$$'");
+      if ($? >> 8 != 0) {
+        warn "gzip -c < '$srcf' > '$zf.$$' failed";
+      }
+
+      rename("$zf.$$", $zf) or
+                    warn "cannot rename $zf.$$ to $zf";
+      $bytes += (-s $zf);
+    }
+
+    my $when = scalar localtime time;
+    print "created: $bytes bytes, finished at $when\n";
+  }
+}
+
+# ---------------------------------------------------------------------------
+
+sub mk_reports {
+  # find each daterev's data directory:
+  # /home/automc/corpus/html/20070609/r545837-b/
+
+  my $threshold_date = strftime("%Y%m%d", gmtime(time - (24*60*60*30)));
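+  # e.g. (illustrative) if run on 2007-08-28, $threshold_date is "20070729";
+  # date directories older than that (about 30 days) are skipped below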
+  foreach my $datedir (<$conf{html}/*>) {
+    next unless ($datedir =~ /\/(2\d+)$/);
+    next if ($threshold_date && $1 < $threshold_date);
+
+    foreach my $revdir (<$datedir/r*>) {
+      if ($opt_daterev) {
+        $revdir =~ m!/(2\d+/r\d+-\S)$!;
+        my $dr = $1;
+        $dr =~ s!/!-!g;
+        if ($dr ne $opt_daterev) {
+          warn "skipping daterev: $dr != $opt_daterev\n";
+          next;
+        }
+      }
+
+      next unless -d $revdir;
+      report_dir($revdir);
+    }
+  }
+}
+
+sub report_dir {
+  my $dir = shift;
+
+  opendir(LOGDIR, $dir);
+  my @files = readdir(LOGDIR);
+  closedir(LOGDIR);
+
+  @files = grep {
+    # LOGS.all-spam-mc-fast.log.gz
+    /^LOGS\.all-(?:spam|ham)-\S+\.log\.gz$/i && -f "$dir/$_"
+                # && -M _ < 10
+  } @files;
+
+  foreach my $file (sort @files) {
+    parse_file_metadata($file);
+  }
+
+
+  foreach my $daterev (sort keys %logs_by_daterev) {
+    my $rev;
+    if ($daterev !~ /\/r(\d+)/) {
+      warn "bad daterev: $daterev"; next;
+    }
+    $rev = $1;
+
+    foreach my $entry (split(' ', $opt_reports)) {
+      my ($class, $age) = ($entry =~ /^(\S+)\.(\S+)$/);
+      if (!$age) { warn "no age: $entry"; next; }
+      if ($class eq 'HTML') { warn "class HTML obsolete: $entry"; next; }
+      if ($class eq "NET" && !$is_net_daterev{$daterev}) { next; }
+
+      eval {
+        gen_class ($daterev, $rev, $class, $age);
+        1;
+      } or warn "gen_class failed: $@ $!";
+    }
+  }
+}
+
+our $no_msgs;
+sub gen_class {
+  my ($daterev, $rev, $class, $age) = @_;
+
+  $daterev =~ /r(\d+)/;
+  get_rulemetadata_for_revision($daterev, $1);
+
+  my $reportdir = "$opt_datadir/$daterev";
+  print STDERR "\ngenerating: $reportdir/$class.$age\n";
+
+  my @ham = grep { /LOGS\.all-ham/ } @{$logs_by_daterev{$daterev}};
+  my @spam = grep { /LOGS\.all-spam/ } @{$logs_by_daterev{$daterev}};
+
+  print STDERR "input h: " . join(' ', @ham) . "\n";
+  print STDERR "input s: " . join(' ', @spam) . "\n";
+
+  # net vs. local
+  if ($class eq "NET") {
+    @ham = grep { /-net-/ } @ham;
+    @spam = grep { /-net-/ } @spam;
+  }
+
+  print STDERR "selected h: " . join(' ', @ham) . "\n";
+  print STDERR "selected s: " . join(' ', @spam) . "\n";
+
+  # we cannot continue if we have no files that match the criteria...
+  # demand at least 1 ham and 1 spam file
+  if (scalar @spam <= 0 || scalar @ham <= 0) {
+    die "not enough files found matching criteria ($daterev $class $age)\n";
+  }
+
+  my $dir = create_outputdir($daterev);
+  my $fname = "$dir/$class.$age";
+
+  # now, if the target file already exists, check to see if it's newer
+  # than all the sources, make-style; if not, don't re-create it
+  if (-f $fname) {
+    my $targetfreshness = (-M $fname);
+    my $needsrebuild = 0;
+
+    foreach my $srcfile (@spam, @ham) {
+      my $srcfreshness = (-M "$opt_dir/$srcfile");
+      if ($targetfreshness > $srcfreshness) {     # src is fresher
+        print "$fname is older than $opt_dir/$srcfile: $targetfreshness > $srcfreshness\n";
+        $needsrebuild = 1;
+        last;
+      }
+    }
+
+    if (!$needsrebuild) {
+      print "existing: $fname, fresher than sources\n";
+      return;
+    }
+  }
+
+  my $when = scalar localtime time;
+  print qq{creating: $fname
+  started $when...
+};
+  my $bytes = 0;
+
+  my $tmpfname = "$fname.$$";
+
+  open(OUT, "> $tmpfname") or warn "cannot write to $tmpfname";
+  print OUT "# ham results used for $daterev $class $age: ".join(" ", @ham)."\n";
+  print OUT "# spam results used for $daterev $class $age: ".join(" ", @spam)."\n";
+  print OUT "# ".log_metadata_xml($daterev, @ham, @spam)."\n";
+
+  push @tmps, Cwd::abs_path($tmpfname);
+
+  my @pathspam = map { "$dir/$_" } @spam;
+  my @pathham = map { "$dir/$_" } @ham;
+
+  my $flags = "";
+  $flags = "-t net -s 1" if $class eq "NET";
+  $flags = "-o" if $class eq "OVERLAP";
+  $flags = "-S" if $class eq "SCOREMAP";
+  if ($conf{rules_dir}) {
+    $flags .= " -c '$conf{rules_dir}'";
+  }
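+
+  # illustrative hit-frequencies invocations that result from these flags
+  # (see run_hit_frequencies below; file arguments shown as placeholders):
+  #   DETAILS:  ./hit-frequencies -TxpaP -c '<rules_dir>' '<spam.log>' '<ham.log>'
+  #   NET:      ./hit-frequencies -TxpaP -t net -s 1 -c '<rules_dir>' '<spam.log>' '<ham.log>'
+  #   SCOREMAP: ./hit-frequencies -TxpaP -S -c '<rules_dir>' '<spam.log>' '<ham.log>'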
+
+  # are we analyzing --net mass-check logs?  if so, use scoreset 1
+  # jm: always use set 1 if the logs allow it
+  if (join(" ", @ham) =~ /-net-/) {
+    $flags .= " -s 1"; # if $class eq "NET";
+  }
+
+  # catch an odd error condition, where hit-frequencies creates output
+  # with no log lines included at all
+  $no_msgs = 0;
+
+  my $tmp_h = "$conf{tmp}/ham.log.$$";
+  my $tmp_s = "$conf{tmp}/spam.log.$$";
+  unlink $tmp_h; unlink $tmp_s;
+
+  my @output = ();
+
+  if ($age eq "all") {
+    my %spam;
+    my %ham;
+    for my $file (@pathspam) {
+      $spam{$1} = $file if ($file =~ m/-(\w[-\w]+)\.log$/);
+    }
+    for my $file (@pathham) {
+      $ham{$1} = $file if ($file =~ m/-(\w[-\w]+)\.log$/);
+    }
+    if (scalar keys %spam <= 0 || scalar keys %ham <= 0) {
+      warn "no files found for $class.$age"; return;
+    }
+
+    my $tmp_all_h = $tmp_h.".all.$$";
+    my $tmp_all_s = $tmp_s.".all.$$";
+    for my $user (sort keys %spam) {
+      next unless $ham{$user};
+      time_filter_fileset([ "$opt_dir/$ham{$user}" ],
+              $tmp_h, $OLDEST_HAM_WEEKS, undef);
+      time_filter_fileset([ "$opt_dir/$spam{$user}" ],
+              $tmp_s, $OLDEST_SPAM_WEEKS, undef);
+
+      push @output, run_hit_frequencies($flags, $user, $tmp_s, $tmp_h);
+      system("cat $tmp_h >> $tmp_all_h");
+      system("cat $tmp_s >> $tmp_all_s");
+    }
+
+    push @output, run_hit_frequencies($flags, undef, $tmp_all_s, $tmp_all_h);
+    for (sort sort_all @output) { print OUT; }
+    unlink ($tmp_all_h, $tmp_all_s);
+  }
+  elsif ($age eq "age") {
+    for my $which (("0-1", "1-2", "2-3", "3-6")) {
+      my ($before, $after) = split(/-/, $which);
+      time_filter_fileset(\@pathham, $tmp_h, $after, $before);
+      time_filter_fileset(\@pathspam, $tmp_s, $after, $before);
+      push @output, run_hit_frequencies($flags, $which, $tmp_s, $tmp_h);
+    }
+    for (sort sort_all @output) { print OUT; }
+  }
+  elsif ($age eq "new") {
+    time_filter_fileset(\@pathham, $tmp_h, $OLDEST_HAM_WEEKS, undef);
+    time_filter_fileset(\@pathspam, $tmp_s, $OLDEST_SPAM_WEEKS, undef);
+    print OUT run_hit_frequencies($flags, undef, $tmp_s, $tmp_h);
+  }
+  else {
+    warn "bad age $age";
+  }
+
+  if ($no_msgs) {
+    warn "ERROR: no data in freqs!  aborting, leaving tmp file as $tmpfname";
+    return;
+  }
+
+  $bytes = (-s OUT);
+  close(OUT);
+  rename($tmpfname, $fname) or warn "cannot rename $tmpfname to $fname";
+  unlink ($tmp_h, $tmp_s);
+
+  # compress for certain classes
+  if ($class eq "OVERLAP") {
+    $fname =~ s/'//gs;
+    system ("gzip '$fname'");
+    # takes care of keeping the original around so we don't have to
+    if ($? >> 8 != 0) { warn "gzip '$fname' failed"; }
+  }
+
+  $when = scalar localtime time;
+  print qq{created: $bytes bytes, finished at $when
+URL:
+
+  $conf{ruleqa_url}$output_revpath
+
+};
+
+}
+
+sub run_hit_frequencies {
+  my ($flags, $suffix, $spamf, $hamf) = @_;
+
+  if (-z $hamf && -z $spamf) {
+    warn "time_filter_fileset() returned empty logs. not creating freqs!";
+    return;     # we'll try again later
+  }
+
+  my @output = ();
+  my $origwd = Cwd::getcwd;
+  {
+    chdir "$conf{tree}/masses" or die "cannot chdir $conf{tree}/masses";
+
+    print "[hit-frequencies -TxpaP $flags '$spamf' '$hamf']\n";
+    open(IN, "./hit-frequencies -TxpaP $flags '$spamf' '$hamf' |");
+
+    while(<IN>) {
+      chomp;
+      /\s0\s+0\s+0.500\s+0.00\s+0.00\s+\(all messages\)/ and $no_msgs = 1;
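+      # (the pattern above matches the "(all messages)" summary row when its
+      # counts are all zero, i.e. hit-frequencies saw no usable log lines;
+      # gen_class() checks $no_msgs and aborts rather than publish empty freqs)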
+      push @output, $_ . ($suffix ? ":$suffix" : "") . "\n";
+    }
+    close(IN) or die "hit-frequencies failed";
+    chdir $origwd or die "cannot return to $origwd";
+  }
+
+  return @output;
+}
+
+# ---------------------------------------------------------------------------
+
+sub sort_all {
+  my ($a1, $a2) = ($a =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);
+  my ($b1, $b2) = ($b =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);
+  $a1 =~ s/^[\+\-]//;
+  $b1 =~ s/^[\+\-]//;
+
+  my $n = ($a1 cmp $b1) || (($a2 || '') cmp ($b2 || ''));
+  if ($a1 =~ /^OVERALL/)			{ $n -= 1000; }
+  elsif ($a1 =~ /^\(all messages\)/)		{ $n -= 100; }
+  elsif ($a1 =~ /^\(all messages as \%\)/)	{ $n -= 10; }
+  if ($b1 =~ /^OVERALL/)			{ $n += 1000; }
+  elsif ($b1 =~ /^\(all messages\)/)		{ $n += 100; }
+  elsif ($b1 =~ /^\(all messages as \%\)/)	{ $n += 10; }
+  return $n;
+}
+
+sub time_filter_fileset {
+  my ($fileary, $outname, $after, $before) = @_;
+
+  my $timet_before = (defined $before ? ($time_start - ($before * WEEK)) : $time_start+1);
+  my $timet_after  = (defined $after ? ($time_start - ($after * WEEK)) : 0);
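+
+  # illustrative: time_filter_fileset(\@logs, $tmp, 4, undef) keeps only lines
+  # whose "time=" field falls within the last 4 weeks, while (\@logs, $tmp, 2, 1)
+  # keeps lines between 1 and 2 weeks old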
+
+  open(TMP, "> $outname") or warn "cannot write $outname";
+
+  for my $file (@{$fileary}) {
+    if ($file =~ /\.gz$/) {
+      open(IN, "gunzip -cd < '$file' |") or warn "cannot gunzip $file";
+    } else {
+      open(IN, $file) or warn "cannot read $file";
+    }
+    while (<IN>) {
+      next unless /\btime=(\d+)/;
+      next if ($1 < $timet_after || $1 > $timet_before);
+      print TMP;
+    }
+    close IN;
+  }
+  close TMP or warn "failed to close $outname";
+}
+
+# ---------------------------------------------------------------------------
+
+sub parse_file_metadata {
+  my $file = shift;
+
+  my $tag = 0;
+  my $headers = '';
+
+  if ($file =~ /\.gz$/) {
+    open(FILE, "gunzip -cd < '$opt_dir/$file' |") or warn "cannot read $opt_dir/$file";
+  } else {
+    open(FILE, "$opt_dir/$file") or warn "cannot read $opt_dir/$file";
+  }
+
+  while (my $line = <FILE>) {
+    last if $line !~ /^#/;
+    $headers .= $line;
+    if ($line =~ /, on (... ... .. )(..)(:..:.. ... ....)/) {
+      my ($datepre, $hh, $datepost) = ($1,$2,$3);
+
+      my $timet = Time::ParseDate::parsedate($datepre.$hh.$datepost,
+                  GMT => 1, PREFER_PAST => 1);
+
+      $time{$file} = $timet;
+    }
+    elsif ($line =~ m/^# Date:\s*(\S+)/) {
+      # a better way to do the above.  TODO: parse it instead
+      $dateline{$file} = $1;
+    }
+    elsif ($line =~ m/^# SVN revision:\s*(\S+)/) {
+      $revision{$file} = $1;
+    }
+  }
+  close(FILE);
+
+  my @s = stat("$opt_dir/$file");
+  $filesize{$file} = $s[7];
+  $mtime{$file} = $s[9];
+
+  if (!defined $time{$file}) {
+    warn "$opt_dir/$file: no time found, ignored\n"; return;
+  }
+  if (!defined $revision{$file}) {
+    warn "$opt_dir/$file: no revision found, ignored\n"; return;
+  }
+  if ($revision{$file} eq 'unknown') {
+    warn "$opt_dir/$file: not tagged with a revision, ignored\n"; return;
+  }
+
+  my $daterev = mk_daterev($time{$file},$revision{$file},$opt_tag);
+
+  $logs_by_daterev{$daterev} ||= [ ];
+  push (@{$logs_by_daterev{$daterev}}, $file);
+
+  if ($file =~ /-net-/) {
+    $is_net_daterev{$daterev} = 1;
+    print "$opt_dir/$file: rev=$daterev time=$time{$file} (set 1)\n";
+  }
+  else {
+    print "$opt_dir/$file: rev=$daterev time=$time{$file} (set 0)\n";
+  }
+}
+
+# ---------------------------------------------------------------------------
+
+sub mk_daterev {
+  my ($timet, $rev, $tag) = @_;
+  return strftime("%Y%m%d", gmtime($timet + DATEREV_ADJ)) . "/r$rev-$tag";
+}
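+
+# e.g. a log mass-checked shortly after 0800 UTC on 2006-04-29 against r239832,
+# with tag "r", yields the daterev "20060429/r239832-r" (the example quoted in
+# the comment near the top of this file)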
+
+sub create_outputdir {
+  my ($revpath) = @_;
+  my $dir = $conf{html} .'/'. $revpath;
+
+  # print "output dir: $dir\n";
+  if (!-d $dir) {
+    my $prevu = umask 0;
+    mkpath([$dir], 0, (oct($conf{html_mode}) || 0755))
+                            or warn "failed to mkdir $dir";
+    umask $prevu;
+  }
+
+  $output_revpath = $revpath;       # set the global
+  $output_revpath =~ s/\//-/;       # looks nicer
+
+  return $dir;
+}
+
+# ---------------------------------------------------------------------------
+
+sub clean_up {
+  chdir "/";
+  system "rm -rf $conf{tmp}/*.$$ '".join("' '", @tmps)."'";
+}
+
+# ---------------------------------------------------------------------------
+
+sub log_metadata_xml {
+  my ($daterev, @files) = @_;
+  my $str = '';
+
+  # this is extracted into the info.xml file later by the gen_info_xml script
+  foreach my $f (@files) {
+    $str .= qq{
+      <mclogmd file='$f'>
+        <daterev>$daterev</daterev>
+        <rev>$revision{$f}</rev>
+        <fsize>$filesize{$f}</fsize>
+        <mcstartdate>$dateline{$f}</mcstartdate>
+        <mtime>$mtime{$f}</mtime>
+      </mclogmd>
+    };
+  }
+
+  $str =~ s/\s+/ /gs;  # on a single line please
+  return '<mclogmds>'.$str.'</mclogmds>';
+}
+
+sub get_rulemetadata_for_revision {
+  my ($daterev, $rev) = @_;
+
+  my $dir = create_rulemetadata_dir($rev);
+
+  # argh.  this is silly; ~bbmass/.corpus specifies "$PWD" in its
+  # "tree" path, so we have to ensure we're in the 'masses' dir
+  # for this to work!
+  {
+    my $origwd = Cwd::getcwd;
+    chdir "$conf{tree}/masses" or die "cannot chdir $conf{tree}/masses";
+
+    my $cmd = "./rule-qa/get-rulemetadata-for-revision ".
+                      "--rev=$rev --outputdir='$dir'";
+
+    system($cmd);
+    if ($? >> 8 != 0) { warn "'$cmd' failed"; }
+
+    chdir $origwd or die "cannot return to $origwd";
+  }
+}
+
+sub create_rulemetadata_dir {
+  my $rev = shift;
+  my $dir = "$opt_datadir/rulemetadata/$rev";
+  if (!-d $dir) {
+    my $prevu = umask 0;
+    mkpath([$dir], 0, (oct($conf{html_mode}) || 0755)) or
+                            warn "failed to mkdir $dir";
+    umask $prevu;
+  }
+  return $dir;
+}
+

Propchange: spamassassin/trunk/masses/rule-qa/mk-ruleqa-reports
------------------------------------------------------------------------------
    svn:executable = *


