logs-to-c   [plain text]


#!/usr/bin/perl -w
#
# <@LICENSE>
# Copyright 2004 Apache Software Foundation
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

use Getopt::Long;
use vars qw($opt_cffile $opt_count $opt_lambda $opt_threshold
		$opt_spam $opt_ham $opt_fplog $opt_fnlog);

GetOptions("cffile=s", "count", "lambda=f", "threshold=f", "spam=s", "ham=s", "scoreset=i", "fplog=s", "fnlog=s");

my $argcffile = $opt_cffile;

my $justcount = 0;
if ($opt_count) { $justcount = 1; }

my $threshold = 5;
if (defined $opt_threshold) { $threshold = $opt_threshold; }

$opt_spam ||= 'spam.log';
$opt_ham ||= 'ham.log';
$opt_scoreset = 0 if ( !defined $opt_scoreset );

# If desired, report false positives and false negatives for analysis
if (defined $opt_fnlog) { open (FNLOG, ">$opt_fnlog"); }
if (defined $opt_fplog) { open (FPLOG, ">$opt_fplog"); }

my $nybias = 10;

# lambda value for TCR equation, representing the cost of of an FP vs. the
# cost of a FN.  Some example values are: 1 = tagged only, 9 = mailed back
# to sender asking for token, 999 = blocking or deleting a message.
#
# We roughly aim for a value representing "moved to infrequently-read folder".

my $lambda = 50;
if ($opt_lambda) { $lambda = $opt_lambda; }

my %is_spam = ();
my %tests_hit = ();
my %mutable_tests = ();

use vars qw(%rules %allrules);

readscores();

print "Reading per-message hit stat logs and scores...\n";
my ($num_tests, $num_spam, $num_ham);
my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore);

read_ranges();
readlogs();

if ($justcount) {
  $nybias = $nybias*($num_spam / $num_ham);
  evaluate();
} else {
  print "Writing logs and current scores as C code...\n";
  writescores_c();
}
exit 0;


sub readlogs {
  my $count = 0;
  $num_spam = $num_ham = 0;

  if ($justcount) {
    $ga_yy = $ga_ny = $ga_yn = $ga_nn = 0;
    $yyscore = $ynscore = $nyscore = $nnscore = 0.0;
  }

  foreach my $file ($opt_spam, $opt_ham) {
    open (IN, "<$file");

    while (<IN>) {
      next unless /^[^#]/;
      if($_ !~ /^.\s+([-\d]+)\s+(\S+)\s*/) { warn "bad line: $_"; next; }
      my $msgline = $_;
      my $hits = $1;
      #my $id = $2;
      $_ = $'; s/(?:bayes|time)=\S+//; s/,,+/,/g; s/^\s+//; s/\s+$//;

      my $score = 0;
      my @tests = ();
      foreach my $tst (split (/,/, $_)) {
	next unless $tst;
	if (!defined $scores{$tst}) {
          #warn "unknown test in $file, ignored: $tst\n";
	  next;
	}

	# Make sure to skip any subrules!
	next if ( $allrules{$tst}->{issubrule} );

        if ($justcount) {
          $score += $scores{$tst};
        } else {
          push (@tests, $tst);
        }
      }

      if (!$justcount) { 
        $tests_hit{$count} = \@tests;
      }

      if ($file eq $opt_spam) {
	$num_spam++;
        if ($justcount) {
          if ($score >= $threshold) {
            $ga_yy++; $yyscore += $score;
          } else {
            $ga_yn++; $ynscore += $score;
	    if (defined $opt_fnlog) {
	    	print FNLOG $msgline;
	    }
          }
        } else {
          $is_spam{$count} = 1;
        }
      } else {
	$num_ham++;
        if ($justcount) {
          if ($score >= $threshold) {
	    #print STDERR "FP: $id\n";
            $ga_ny++; $nyscore += $score;
	    if (defined $opt_fplog) {
	    	print FPLOG $msgline;
	    }
          } else {
            $ga_nn++; $nnscore += $score;
          }
        } else {
          $is_spam{$count} = 0;
        }
      }
      $count++;
    }
    close IN;
  }
  $num_tests = $count;
}


sub readscores {
  if (!defined $argcffile) { $argcffile = "../rules"; }
  print "Reading scores from \"$argcffile\"...\n";
  system ("./parse-rules-for-masses -d \"$argcffile\" -s $opt_scoreset") and die;
  require "./tmp/rules.pl";
  %allrules = %rules;           # ensure it stays global
}

sub writescores_c {
  my $output = '';
  my $size = 0;
  my $mutable = 0;
  my $i;

    # jm: now, score-ranges-from-freqs has tflags to work from, so
    # it will always list all mutable tests.

  @index_to_rule = sort {($ignored_rule{$a} <=> $ignored_rule{$b}) ||
			  ($mutable_tests{$b} <=> $mutable_tests{$a}) ||
			   ($a cmp $b)} (keys %scores);
  my $max_hits_per_msg = 0;
  for ($file = 0; $file < $num_tests; $file++) {
    my(@hits) =
     grep {(! $ignored_rule{$_}) && $mutable_tests{$_}} (@{$tests_hit{$file}});
    if ((scalar(@hits)+1) > $max_hits_per_msg) {
      $max_hits_per_msg = scalar(@hits)+1;
    }
  }

  for ($i = 0; $i <= $#index_to_rule; $i++) {
    my $name = $index_to_rule[$i];
    $rule_to_index{$name} = $i;

    if ($ignored_rule{$name}) { next; }

    if ($mutable_tests{$name} == 0) {
      $range_lo{$name} = $range_hi{$name} = $scores{$name};
    } else {
      $mutable++;
      if ($range_lo{$name} > $range_hi{$name}) {
	($range_lo{$name},$range_hi{$name}) =
	 ($range_hi{$name},$range_lo{$name});
      }
      #$range_lo{$name} ||= 0.1;
      #$range_hi{$name} ||= 1.5;
    }

    $output .= ".".$i."\n".
                "n".$name."\n".
                "b".$scores{$name}."\n".
                "m".$mutable_tests{$name}."\n".
                "l".$range_lo{$name}."\n".
                "h".$range_hi{$name}."\n";
    $size++;
  }


  open (DAT, ">tmp/scores.data");
  print DAT "N$size\n", "M$mutable\n", # informational only
   $output;
  close DAT;

  open (OUT, ">tmp/scores.h");
  print OUT "
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

int num_scores = $size;
int num_mutable = $mutable;
unsigned char is_mutable[$size];
double range_lo[$size];
double range_hi[$size];
double bestscores[$size];
char *score_names[$size];
double tmp_scores[$size][2];
unsigned char ny_hit[$mutable];
unsigned char yn_hit[$mutable];

double lookup[$mutable];

/* readscores() is defined in tests.h */

";
  close OUT;

  writetests_c($max_hits_per_msg); # make sure $rule_to_index is around
}

sub writetests_c {
  my $max_hits_per_msg = $_[0];

  my(%uniq_files) = ();
  my(%count_keys) = ();
  my(%file_key) = ();

  my $file;

  for ($file = 0; $file < $num_tests; $file++)
  {
    my $uniq_key = $is_spam{$file} . " ";

    my(@good_tests) =
     grep {length($_) && (! $ignored_rule{$_}) &&
	    (defined($rule_to_index{$_}))} (@{ $tests_hit{$file} });

    @good_tests = sort {$a <=> $b} (map {$rule_to_index{$_}} (@good_tests));

    $uniq_key .= join(" ",@good_tests);

    if (exists($count_keys{$uniq_key})) {
      $count_keys{$uniq_key}++;
    } else {
      $count_keys{$uniq_key} = 1;
      $file_key{$file} = $uniq_key;
      $uniq_files{$file} = scalar(keys(%count_keys)) - 1;
    }
  }

  my $num_nondup = scalar(keys(%uniq_files));

  open (TOP, ">tmp/tests.h");
  print TOP "
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

int num_tests = $num_tests;
int num_nondup = $num_nondup;
int num_spam = $num_spam;
int num_ham = $num_ham;
int max_hits_per_msg = $max_hits_per_msg;
unsigned char num_tests_hit[$num_nondup];
unsigned char is_spam[$num_nondup];
unsigned short tests_hit[$num_nondup][$max_hits_per_msg];
double scores[$num_nondup];
double tmp_total[$num_nondup];
int tests_count[$num_nondup];

";
  $_ = join ('', <DATA>);
  print TOP $_;
  close TOP;

  open (DAT, ">tmp/tests.data");

  foreach $file (sort {$a <=> $b} (keys %uniq_files)) {
    print DAT ".".$uniq_files{$file}."\n";

    my $out = '';
    $out .= "s".$is_spam{$file}."\n";

    my $base_score = 0;
    my $num_tests_hit = 0;
    foreach my $test (@{$tests_hit{$file}}) {
      if ($test eq '') { next; }

      if ($ignored_rule{$test}) {
        warn "ignored rule $test got a hit in $file!\n";
        next;
      }

      if (!defined $rule_to_index{$test}) {
	warn "test with no C index: $test\n";
	next;
      }

      if ($mutable_tests{$test}) {
      $num_tests_hit++;
      $out .= "t".$rule_to_index{$test}."\n";

      if ($num_tests_hit >= $max_hits_per_msg) {
	die "Need to increase \$max_hits_per_msg";
      }
      } else {
	$base_score += $scores{$test};
      }
    }

    $out .= "b" . $base_score . "\n"; # score to add in for non-mutable tests
    $out .= "c" . $count_keys{$file_key{$file}} . "\n";

    print DAT "n".$num_tests_hit."\n".$out;
  }
  close DAT;
}

sub read_ranges {
  if (!-f 'tmp/ranges.data') {
    system ("make tmp/ranges.data");
  }

  # read ranges, and mutableness, from ranges.data.
  open (IN, "<tmp/ranges.data")
  	or die "need to run score-ranges-from-freqs first!";

  my $count = 0;
  while (<IN>) {
    /^(\S+) (\S+) (\d+) (\S+)$/ or next;
    my $t = $4;
    $range_lo{$t} = $1+0;
    $range_hi{$t} = $2+0;
    my $mut = $3+0;

    if ($allrules{$t}->{issubrule}) {
      # warn "ignoring '$t': is sub-rule\n";    # no need to warn
      $ignored_rule{$t} = 1;
      $mutable_tests{$t} = 0;
      next;
    }
    if (($range_lo{$t} == $range_hi{$t}) && (! $range_lo{$t})) {
      warn "ignoring '$t': score and range == 0\n";
      $ignored_rule{$t} = 1;
      $mutable_tests{$t} = 0;
      next;
    }

    $ignored_rule{$t} = 0;
    $index_to_rule[$count] = $t;
    $count++;

    if (!$mut) {
      $mutable_tests{$t} = 0;
    } elsif ($range_lo{$t} == $range_hi{$t}) {
      $mutable_tests{$t} = 0;
    } elsif ($allrules{$t}->{tflags} =~ m/\buserconf\b/i) {
      $mutable_tests{$t} = 0;
    } else {
      $mutable_tests{$t} = 1;
    }
    unless ($mutable_tests{$t} || $scores{$t}) {
      warn "ignoring '$t': immutable and score == 0\n";
      $ignored_rule{$t} = 1;
    }
  }
  close IN;

  # catch up on the ones missed; seems to be userconf or 0-hitters mostly.
  foreach my $t (sort keys %allrules) {
    next if (exists($range_lo{$t}));
    if ($allrules{$t}->{issubrule}) {
      if (!$ignored_rule{$t}) {
        # warn "ignoring '$t': is sub-rule\n";  # no need to warn here
        $ignored_rule{$t} = 1;
      }
      $mutable_tests{$t} = 0;
      next;
    }
    $ignored_rule{$t} = 0;
    unless (exists($mutable_tests{$t}) &&
	    ($allrules{$t}->{tflags} !~ m/\buserconf\b/i)) {
      $mutable_tests{$t} = 0;
    }
    unless ($mutable_tests{$t} || $scores{$t}) {
      if (!$ignored_rule{$t}) {
        warn "ignoring '$t': immutable and score == 0\n";
        $ignored_rule{$t} = 1;
      }
    }
    $index_to_rule[$count] = $t;
    $count++;
  }
  foreach my $t (keys %range_lo) {
    next if ($ignored_rule{$t});
    if ($mutable_tests{$t}) {
      if (($scores{$t} == 1) && ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
	$scores{$t} = -1;
      } elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/) &&
	       ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
	$scores{$t} = -0.01;
      }
      if ($scores{$t} >= $range_hi{$t}) {
	$scores{$t} = $range_hi{$t} - 0.001;
      } elsif ($scores{$t} <= $range_lo{$t}) {
	$scores{$t} = $range_lo{$t} + 0.001;
      }
    } else {
      if ($allrules{$t}->{tflags} =~ m/\buserconf\b/i) {
	next;
      } elsif ($range_lo{$t} == $range_hi{$t}) {
	$scores{$t} = $range_lo{$t};
	next;
      }
      if (($scores{$t} == 1) && ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
	$scores{$t} = -1;
      } elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/) &&
	       ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
	$scores{$t} = -0.01;
      }
      if ($scores{$t} > $range_hi{$t}) {
	$scores{$t} = $range_hi{$t};
      } elsif ($scores{$t} < $range_lo{$t}) {
	$scores{$t} = $range_lo{$t};
      }
    }
  }
}

sub evaluate {
   printf ("\n# SUMMARY for threshold %3.1f:\n", $threshold);
   printf "# Correctly non-spam: %6d  %4.2f%%\n",
       $ga_nn, ($ga_nn /  $num_ham) * 100.0;
   printf "# Correctly spam:     %6d  %4.2f%%\n",
       $ga_yy, ($ga_yy /  $num_spam) * 100.0;
   printf "# False positives:    %6d  %4.2f%%\n",
       $ga_ny, ($ga_ny /  $num_ham) * 100.0;
   printf "# False negatives:    %6d  %4.2f%%\n",
       $ga_yn, ($ga_yn /  $num_spam) * 100.0;

  # convert to the TCR metrics used in the published lit
  my $nspamspam = $ga_yy;
  my $nspamlegit = $ga_yn;
  my $nlegitspam = $ga_ny;
  my $nlegitlegit = $ga_yn;
  my $nlegit = $num_ham;
  my $nspam = $num_spam;

  my $werr = ($lambda * $nlegitspam + $nspamlegit)
                  / ($lambda * $nlegit + $nspam);

  my $werr_base = $nspam
                  / ($lambda * $nlegit + $nspam);

  $werr ||= 0.000001;     # avoid / by 0
  my $tcr = $werr_base / $werr;

  my $sr = ($nspamspam / $nspam) * 100.0;
  my $sp = ($nspamspam / ($nspamspam + $nlegitspam)) * 100.0;
  printf "# TCR(l=%s): %3.6f  SpamRecall: %3.3f%%  SpamPrec: %3.3f%%\n",
    $lambda, $tcr, $sr, $sp;
}

__DATA__

void loadtests (void) {
  FILE *fin = fopen ("tmp/tests.data", "r");
  char buf[256];
  int file = 0;
  int tnum = 0;

  while (fgets (buf, 255, fin) != NULL) {
    char cmd;
    long arg;
    float argd;

    cmd = (char) *buf;
    arg = strtol (buf+1, NULL, 10);
    argd = (float)strtod (buf+1, NULL);

    if (cmd == '.') {
      file = arg;

    } else if (cmd == 'n') {
      tnum = 0;
      num_tests_hit[file] = arg;

    } else if (cmd == 's') {
      is_spam[file] = arg;

    } else if (cmd == 'b') {
      scores[file] = argd;

    } else if (cmd == 't') {
      tests_hit[file][tnum] = arg; tnum++;

    } else if (cmd == 'c') {
      tests_count[file] = arg;

    }
  }
  fclose(fin);

  printf ("Read test results for %d messages (%d total).\n", file+1,
	  num_tests);
}

void loadscores (void) {
  FILE *fin = fopen ("tmp/scores.data", "r");
  char buf[256];
  int snum = 0;

  while (fgets (buf, 255, fin) != NULL) {
    char cmd;
    long arg;
    float argd;
    char *str, *white;

    cmd = (char) *buf;
    arg = strtol (buf+1, NULL, 10);
    argd = (float)strtod (buf+1, NULL);
    str = buf+1;

    while ((white = strchr (str, '\n')) != NULL) {
      *white = '\0';
    }

    if (cmd == '.') {
      snum = arg;

    } else if (cmd == 'b') {
      bestscores[snum] = argd;

    } else if (cmd == 'l') {
      range_lo[snum] = argd;

    } else if (cmd == 'h') {
      range_hi[snum] = argd;

    } else if (cmd == 'n') {
      score_names[snum] = strdup (str);	/* leaky leak ;) */

    } else if (cmd == 'm') {
      is_mutable[snum] = arg;
    }
  }
  fclose(fin);

  printf ("Read scores for %d tests.\n", num_scores);
}