use Getopt::Long;
# Command-line options.  Getopt::Long stores each option named below in the
# package global $opt_<name>, so every one of them is declared here.
use vars qw($opt_cffile $opt_count $opt_lambda $opt_threshold
            $opt_spam $opt_ham $opt_scoreset $opt_fplog $opt_fnlog);
GetOptions("cffile=s", "count", "lambda=f", "threshold=f", "spam=s", "ham=s",
           "scoreset=i", "fplog=s", "fnlog=s");

# Rules location handed to readscores(); it falls back to "../rules" there.
my $argcffile = $opt_cffile;

# --count: only score the logs and print a summary (evaluate()) instead of
# writing the C data files.
my $justcount = $opt_count ? 1 : 0;

# Score at or above which a message is counted as spam in --count mode.
my $threshold = 5;
if (defined $opt_threshold) { $threshold = $opt_threshold; }

$opt_spam ||= 'spam.log';
$opt_ham ||= 'ham.log';
$opt_scoreset = 0 if ( !defined $opt_scoreset );

# Optional logs of misclassified messages, written by readlogs() in --count
# mode.  Fail loudly rather than silently dropping the requested log.
if (defined $opt_fnlog) {
    open (FNLOG, '>', $opt_fnlog)
        or die "cannot write false-negative log \"$opt_fnlog\": $!\n";
}
if (defined $opt_fplog) {
    open (FPLOG, '>', $opt_fplog)
        or die "cannot write false-positive log \"$opt_fplog\": $!\n";
}

my $nybias = 10;

# Weight applied to ham-side counts in the TCR figures printed by evaluate().
# Tested with defined() (like --threshold) so an explicit "--lambda 0" is
# honoured instead of being replaced by the default.
my $lambda = 50;
if (defined $opt_lambda) { $lambda = $opt_lambda; }

my %is_spam = ();        # message index -> 1 (spam log) or 0 (ham log)
my %tests_hit = ();      # message index -> array ref of rule names hit
my %mutable_tests = ();  # rule name -> 1 if its score may be changed, else 0

use vars qw(%rules %allrules);

readscores();

print "Reading per-message hit stat logs and scores...\n";
my ($num_tests, $num_spam, $num_ham);
# --count mode tallies.  First letter: y = from the spam log, n = from the
# ham log; second letter: y = scored at/above the threshold, n = below it.
my ($ga_yy, $ga_ny, $ga_yn, $ga_nn, $yyscore, $ynscore, $nyscore, $nnscore);

read_ranges();
readlogs();

if ($justcount) {
    # Skip the rescaling when the ham log is empty: dividing by zero here
    # would abort the run before any summary is printed.
    $nybias = $nybias * ($num_spam / $num_ham) if $num_ham;
    evaluate();
} else {
    print "Writing logs and current scores as C code...\n";
    writescores_c();
}
exit 0;
# readlogs()
#
# Reads the spam log ($opt_spam) and then the ham log ($opt_ham), one message
# per line.  Lines starting with '#' are skipped; any other line must look
# like "<char> <number> <id> <comma-separated rule names>", otherwise a
# warning is issued and the line is skipped.
#
# Rule names without an entry in %scores, and sub-rules, are dropped.
#
# In --count mode each message's score is summed from %scores and compared
# with $threshold; the $ga_* counters and *score totals are updated, and
# misclassified lines are copied to FNLOG / FPLOG when those logs were
# requested.  Otherwise the rules hit and the spam/ham flag are recorded in
# %tests_hit and %is_spam, keyed by message index.
#
# Always sets $num_spam, $num_ham and $num_tests (total messages read).
# Dies if a log file cannot be opened.
sub readlogs {
    my $count = 0;
    $num_spam = $num_ham = 0;

    if ($justcount) {
        $ga_yy = $ga_ny = $ga_yn = $ga_nn = 0;
        $yyscore = $ynscore = $nyscore = $nnscore = 0.0;
    }

    foreach my $file ($opt_spam, $opt_ham) {
        open (IN, '<', $file) or die "cannot read log \"$file\": $!\n";

        while (my $msgline = <IN>) {
            # skip comment lines
            next unless $msgline =~ /^[^#]/;

            # Everything after the third field is the list of rules hit.
            if ($msgline !~ /^.\s+[-\d]+\s+\S+\s*(.*)/s) {
                warn "bad line: $msgline";
                next;
            }
            my $hitlist = $1;

            # Drop trailing metadata, collapse empty list entries, trim.
            $hitlist =~ s/(?:bayes|time)=\S+//;
            $hitlist =~ s/,,+/,/g;
            $hitlist =~ s/^\s+//;
            $hitlist =~ s/\s+$//;

            my $score = 0;
            my @tests = ();
            foreach my $tst (split (/,/, $hitlist)) {
                next unless $tst;
                if (!defined $scores{$tst}) {
                    #warn "unknown test in $file, ignored: $tst\n";
                    next;
                }

                # Make sure to skip any subrules!
                next if ( $allrules{$tst}->{issubrule} );

                if ($justcount) {
                    $score += $scores{$tst};
                } else {
                    push (@tests, $tst);
                }
            }

            if (!$justcount) {
                $tests_hit{$count} = \@tests;
            }

            if ($file eq $opt_spam) {
                $num_spam++;
                if ($justcount) {
                    if ($score >= $threshold) {
                        $ga_yy++; $yyscore += $score;
                    } else {
                        # spam that scored below the threshold: false negative
                        $ga_yn++; $ynscore += $score;
                        if (defined $opt_fnlog) {
                            print FNLOG $msgline;
                        }
                    }
                } else {
                    $is_spam{$count} = 1;
                }
            } else {
                $num_ham++;
                if ($justcount) {
                    if ($score >= $threshold) {
                        # ham that reached the threshold: false positive
                        $ga_ny++; $nyscore += $score;
                        if (defined $opt_fplog) {
                            print FPLOG $msgline;
                        }
                    } else {
                        $ga_nn++; $nnscore += $score;
                    }
                } else {
                    $is_spam{$count} = 0;
                }
            }
            $count++;
        }
        close IN;
    }
    $num_tests = $count;
}
# readscores()
#
# Runs ./parse-rules-for-masses on the rules location ($argcffile, default
# "../rules") for score set $opt_scoreset, then loads ./tmp/rules.pl and
# copies %rules into %allrules.
# NOTE(review): tmp/rules.pl is presumably produced by parse-rules-for-masses
# and presumably also defines %scores -- confirm against that script.
#
# Dies if the helper cannot be run or exits with a non-zero status.
sub readscores {
    if (!defined $argcffile) { $argcffile = "../rules"; }
    print "Reading scores from \"$argcffile\"...\n";

    # List form: the path goes to the helper as a single argument without
    # passing through a shell, so quotes or metacharacters in it are harmless.
    my @cmd = ('./parse-rules-for-masses',
               '-d', $argcffile, '-s', $opt_scoreset);
    system (@cmd) == 0
        or die "\"@cmd\" failed (wait status $?)\n";

    require "./tmp/rules.pl";
    %allrules = %rules; # ensure it stays global
}
# writescores_c()
#
# Writes the per-rule data consumed by the generated C code, then calls
# writetests_c() to write the per-message data.
#
#   tmp/scores.data  "N<rules written>" and "M<mutable rules>" followed, for
#                    each non-ignored rule, by the lines ".<index>",
#                    "n<name>", "b<score>", "m<mutable flag>", "l<range low>"
#                    and "h<range high>".
#   tmp/scores.h     C declarations sized from those two counts.
#
# Side effects: rebuilds @index_to_rule and %rule_to_index; for immutable
# rules both range ends are set to the rule's score, and an inverted range on
# a mutable rule is swapped.
# Dies if either output file cannot be written.
sub writescores_c {
    my $output = '';
    my $size = 0;
    my $mutable = 0;

    # jm: now, score-ranges-from-freqs has tflags to work from, so
    # it will always list all mutable tests.

    # Non-ignored rules first, then mutable before immutable, then by name.
    @index_to_rule = sort {($ignored_rule{$a} <=> $ignored_rule{$b}) ||
                           ($mutable_tests{$b} <=> $mutable_tests{$a}) ||
                           ($a cmp $b)} (keys %scores);

    # Largest number of mutable, non-ignored rules hit by one message, plus 1.
    my $max_hits_per_msg = 0;
    for my $msg (0 .. $num_tests - 1) {
        my $hits = grep { (! $ignored_rule{$_}) && $mutable_tests{$_} }
                        (@{$tests_hit{$msg}});
        if (($hits + 1) > $max_hits_per_msg) {
            $max_hits_per_msg = $hits + 1;
        }
    }

    for my $i (0 .. $#index_to_rule) {
        my $name = $index_to_rule[$i];
        $rule_to_index{$name} = $i;

        if ($ignored_rule{$name}) { next; }

        if ($mutable_tests{$name} == 0) {
            $range_lo{$name} = $range_hi{$name} = $scores{$name};
        } else {
            $mutable++;
            if ($range_lo{$name} > $range_hi{$name}) {
                ($range_lo{$name},$range_hi{$name}) =
                    ($range_hi{$name},$range_lo{$name});
            }
            #$range_lo{$name} ||= 0.1;
            #$range_hi{$name} ||= 1.5;
        }

        $output .= ".".$i."\n".
                   "n".$name."\n".
                   "b".$scores{$name}."\n".
                   "m".$mutable_tests{$name}."\n".
                   "l".$range_lo{$name}."\n".
                   "h".$range_hi{$name}."\n";
        $size++;
    }

    open (my $dat, '>', 'tmp/scores.data')
        or die "cannot write tmp/scores.data: $!\n";
    print {$dat} "N$size\n", "M$mutable\n", # informational only
                 $output;
    close ($dat) or die "error writing tmp/scores.data: $!\n";

    open (my $hdr, '>', 'tmp/scores.h')
        or die "cannot write tmp/scores.h: $!\n";
    print {$hdr} "
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
int num_scores = $size;
int num_mutable = $mutable;
unsigned char is_mutable[$size];
double range_lo[$size];
double range_hi[$size];
double bestscores[$size];
char *score_names[$size];
double tmp_scores[$size][2];
unsigned char ny_hit[$mutable];
unsigned char yn_hit[$mutable];
double lookup[$mutable];
/* readscores() is defined in tests.h */
";
    close ($hdr) or die "error writing tmp/scores.h: $!\n";

    writetests_c($max_hits_per_msg); # make sure $rule_to_index is around
}
# writetests_c($max_hits_per_msg)
#
# Writes the per-message data consumed by the generated C code.
#
#   tmp/tests.h     C declarations sized from the message counts, followed by
#                   the C source stored after __DATA__ in this file.
#   tmp/tests.data  one record per distinct message pattern: ".<record
#                   index>", "n<mutable rules hit>", "s<is spam>", one
#                   "t<rule index>" per mutable rule hit, "b<summed score of
#                   the immutable rules hit>" and "c<messages sharing this
#                   pattern>".
#
# Messages with the same spam flag and the same set of rule indices are
# collapsed into a single record.
#
# $max_hits_per_msg is the second dimension of the C tests_hit array; the sub
# dies if a message reaches that many mutable hits.  It also dies if an
# output file cannot be written.
sub writetests_c {
    my $max_hits_per_msg = $_[0];

    my(%uniq_files) = ();   # first message of each pattern -> record index
    my(%count_keys) = ();   # pattern key -> number of messages with it
    my(%file_key) = ();     # first message of each pattern -> pattern key

    my $file;

    for ($file = 0; $file < $num_tests; $file++)
    {
        my $uniq_key = $is_spam{$file} . " ";

        my(@good_tests) =
            grep {length($_) && (! $ignored_rule{$_}) &&
                  (defined($rule_to_index{$_}))} (@{ $tests_hit{$file} });

        @good_tests = sort {$a <=> $b} (map {$rule_to_index{$_}} (@good_tests));

        $uniq_key .= join(" ",@good_tests);

        if (exists($count_keys{$uniq_key})) {
            $count_keys{$uniq_key}++;
        } else {
            $count_keys{$uniq_key} = 1;
            $file_key{$file} = $uniq_key;
            $uniq_files{$file} = scalar(keys(%count_keys)) - 1;
        }
    }

    my $num_nondup = scalar(keys(%uniq_files));

    open (my $top, '>', 'tmp/tests.h')
        or die "cannot write tmp/tests.h: $!\n";
    print {$top} "
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
int num_tests = $num_tests;
int num_nondup = $num_nondup;
int num_spam = $num_spam;
int num_ham = $num_ham;
int max_hits_per_msg = $max_hits_per_msg;
unsigned char num_tests_hit[$num_nondup];
unsigned char is_spam[$num_nondup];
unsigned short tests_hit[$num_nondup][$max_hits_per_msg];
double scores[$num_nondup];
double tmp_total[$num_nondup];
int tests_count[$num_nondup];
";
    # Append the C loader functions stored after __DATA__.
    print {$top} join ('', <DATA>);
    close ($top) or die "error writing tmp/tests.h: $!\n";

    open (my $dat, '>', 'tmp/tests.data')
        or die "cannot write tmp/tests.data: $!\n";

    foreach $file (sort {$a <=> $b} (keys %uniq_files)) {
        print {$dat} ".".$uniq_files{$file}."\n";

        my $out = '';
        $out .= "s".$is_spam{$file}."\n";

        my $base_score = 0;
        my $num_tests_hit = 0;
        foreach my $test (@{$tests_hit{$file}}) {
            if ($test eq '') { next; }

            if ($ignored_rule{$test}) {
                warn "ignored rule $test got a hit in $file!\n";
                next;
            }

            if (!defined $rule_to_index{$test}) {
                warn "test with no C index: $test\n";
                next;
            }

            if ($mutable_tests{$test}) {
                $num_tests_hit++;
                $out .= "t".$rule_to_index{$test}."\n";

                if ($num_tests_hit >= $max_hits_per_msg) {
                    die "Need to increase \$max_hits_per_msg";
                }
            } else {
                $base_score += $scores{$test};
            }
        }

        $out .= "b" . $base_score . "\n"; # score to add in for non-mutable tests
        $out .= "c" . $count_keys{$file_key{$file}} . "\n";

        # The hit count goes first so the reader sees it before the "t" lines.
        print {$dat} "n".$num_tests_hit."\n".$out;
    }
    close ($dat) or die "error writing tmp/tests.data: $!\n";
}
# read_ranges()
#
# Loads tmp/ranges.data (running "make tmp/ranges.data" first when the file
# is missing) and decides, for every rule, whether it is ignored and whether
# its score is mutable.  Lines of the file have the form
# "<low> <high> <mutable flag> <rule name>"; other lines are skipped.
#
# Fills %range_lo, %range_hi, %ignored_rule, %mutable_tests and
# @index_to_rule, then adjusts %scores so each non-ignored score fits its
# range.  Dies if tmp/ranges.data cannot be opened.
sub read_ranges {
    system ("make tmp/ranges.data") unless -f 'tmp/ranges.data';

    # read ranges, and mutableness, from ranges.data.
    open (IN, "<tmp/ranges.data")
        or die "need to run score-ranges-from-freqs first!";

    my $next_index = 0;

    while (my $entry = <IN>) {
        my ($lo, $hi, $mut_flag, $rule) =
            $entry =~ /^(\S+) (\S+) (\d+) (\S+)$/
                or next;

        $range_lo{$rule} = $lo + 0;
        $range_hi{$rule} = $hi + 0;
        my $may_mutate = $mut_flag + 0;

        # Sub-rules are dropped silently; all-zero ranges with a warning.
        my $is_subrule = $allrules{$rule}->{issubrule};
        my $empty_range = !$is_subrule
            && ($range_lo{$rule} == $range_hi{$rule})
            && (! $range_lo{$rule});
        if ($is_subrule || $empty_range) {
            warn "ignoring '$rule': score and range == 0\n" if $empty_range;
            $ignored_rule{$rule} = 1;
            $mutable_tests{$rule} = 0;
            next;
        }

        $ignored_rule{$rule} = 0;
        $index_to_rule[$next_index++] = $rule;

        # A rule is fixed when the file says so, when its range is a single
        # point, or when it carries the userconf tflag.
        my $fixed = !$may_mutate
            || ($range_lo{$rule} == $range_hi{$rule})
            || ($allrules{$rule}->{tflags} =~ m/\buserconf\b/i);
        $mutable_tests{$rule} = $fixed ? 0 : 1;

        if (!$mutable_tests{$rule} && !$scores{$rule}) {
            warn "ignoring '$rule': immutable and score == 0\n";
            $ignored_rule{$rule} = 1;
        }
    }
    close IN;

    # catch up on the ones missed; seems to be userconf or 0-hitters mostly.
    for my $rule (sort keys %allrules) {
        next if exists($range_lo{$rule});

        if ($allrules{$rule}->{issubrule}) {
            $ignored_rule{$rule} ||= 1;
            $mutable_tests{$rule} = 0;
            next;
        }

        $ignored_rule{$rule} = 0;

        my $keep_flag = exists($mutable_tests{$rule})
            && ($allrules{$rule}->{tflags} !~ m/\buserconf\b/i);
        $mutable_tests{$rule} = 0 unless $keep_flag;

        if (!$mutable_tests{$rule} && !$scores{$rule}) {
            warn "ignoring '$rule': immutable and score == 0\n";
            $ignored_rule{$rule} = 1;
        }

        $index_to_rule[$next_index++] = $rule;
    }

    # Fit every remaining score into its range.
    for my $rule (keys %range_lo) {
        next if $ignored_rule{$rule};

        my $is_mutable = $mutable_tests{$rule};

        if (!$is_mutable) {
            # userconf scores are left alone; a single-point range simply
            # dictates the score.
            next if $allrules{$rule}->{tflags} =~ m/\buserconf\b/i;
            if ($range_lo{$rule} == $range_hi{$rule}) {
                $scores{$rule} = $range_lo{$rule};
                next;
            }
        }

        # Scores of exactly 1 (or 0.01 for T_ rules) on "nice" rules are
        # flipped to the negative value.
        if (($scores{$rule} == 1)
            && ($allrules{$rule}->{tflags} =~ m/\bnice\b/i))
        {
            $scores{$rule} = -1;
        }
        elsif (($scores{$rule} == 0.01) && ($rule =~ m/^T_/)
               && ($allrules{$rule}->{tflags} =~ m/\bnice\b/i))
        {
            $scores{$rule} = -0.01;
        }

        if ($is_mutable) {
            # mutable scores are kept strictly inside the range
            if ($scores{$rule} >= $range_hi{$rule}) {
                $scores{$rule} = $range_hi{$rule} - 0.001;
            }
            elsif ($scores{$rule} <= $range_lo{$rule}) {
                $scores{$rule} = $range_lo{$rule} + 0.001;
            }
        }
        else {
            # immutable scores may sit on the range boundary
            if ($scores{$rule} > $range_hi{$rule}) {
                $scores{$rule} = $range_hi{$rule};
            }
            elsif ($scores{$rule} < $range_lo{$rule}) {
                $scores{$rule} = $range_lo{$rule};
            }
        }
    }
}
# evaluate()
#
# Prints the --count summary for the tallies gathered by readlogs(): the
# number and percentage of correctly classified ham and spam, of false
# positives and false negatives, followed by the weighted total cost ratio
# (TCR, using $lambda), spam recall and spam precision.
#
# A ratio whose denominator is zero (an empty log, or no message reaching
# the threshold) is reported as 0 rather than aborting with a division by
# zero.
sub evaluate {
    # $part as a percentage of $whole; 0 when $whole is empty.
    my $percent = sub {
        my ($part, $whole) = @_;
        return $whole ? ($part / $whole) * 100.0 : 0;
    };

    printf ("\n# SUMMARY for threshold %3.1f:\n", $threshold);
    printf "# Correctly non-spam: %6d %4.2f%%\n",
        $ga_nn, $percent->($ga_nn, $num_ham);
    printf "# Correctly spam: %6d %4.2f%%\n",
        $ga_yy, $percent->($ga_yy, $num_spam);
    printf "# False positives: %6d %4.2f%%\n",
        $ga_ny, $percent->($ga_ny, $num_ham);
    printf "# False negatives: %6d %4.2f%%\n",
        $ga_yn, $percent->($ga_yn, $num_spam);

    # convert to the TCR metrics used in the published lit
    my $nspamspam = $ga_yy;     # spam classified as spam
    my $nspamlegit = $ga_yn;    # spam classified as legitimate
    my $nlegitspam = $ga_ny;    # legitimate mail classified as spam
    my $nlegit = $num_ham;
    my $nspam = $num_spam;

    # Weighted error rate, and the same rate for a baseline that lets every
    # message through (so every spam counts as an error).
    my $weighted_total = $lambda * $nlegit + $nspam;
    my $werr = $weighted_total
        ? ($lambda * $nlegitspam + $nspamlegit) / $weighted_total
        : 0;
    my $werr_base = $weighted_total ? $nspam / $weighted_total : 0;

    $werr ||= 0.000001; # avoid / by 0
    my $tcr = $werr_base / $werr;

    my $sr = $percent->($nspamspam, $nspam);
    my $sp = $percent->($nspamspam, $nspamspam + $nlegitspam);
    printf "# TCR(l=%s): %3.6f SpamRecall: %3.3f%% SpamPrec: %3.3f%%\n",
        $lambda, $tcr, $sr, $sp;
}
__DATA__
/*
 * loadtests: read tmp/tests.data into the global per-message arrays.
 *
 * Each line is a one-character command followed by a number:
 *   '.'  selects the record that the following lines describe
 *   'n'  number of tests hit (also restarts the tests_hit column)
 *   's'  is_spam flag
 *   'b'  base score
 *   't'  index of a test that was hit (stored in the next free column)
 *   'c'  tests_count entry for the record
 * Lines starting with any other character are ignored.
 *
 * Exits with status 1 if the data file cannot be opened.
 */
void loadtests (void) {
  FILE *fin = fopen ("tmp/tests.data", "r");
  char buf[256];
  int file = 0;
  int tnum = 0;

  /* Without this check a missing file would hand NULL to fgets(). */
  if (fin == NULL) {
    perror ("tmp/tests.data");
    exit (1);
  }

  while (fgets (buf, sizeof (buf), fin) != NULL) {
    char cmd;
    long arg;
    float argd;

    cmd = (char) *buf;
    /* Parse the argument both ways; each command picks the one it needs. */
    arg = strtol (buf+1, NULL, 10);
    argd = (float)strtod (buf+1, NULL);

    if (cmd == '.') {
      file = arg;
    } else if (cmd == 'n') {
      tnum = 0;
      num_tests_hit[file] = arg;
    } else if (cmd == 's') {
      is_spam[file] = arg;
    } else if (cmd == 'b') {
      scores[file] = argd;
    } else if (cmd == 't') {
      tests_hit[file][tnum] = arg; tnum++;
    } else if (cmd == 'c') {
      tests_count[file] = arg;
    }
  }
  fclose(fin);

  printf ("Read test results for %d messages (%d total).\n", file+1,
          num_tests);
}
/*
 * loadscores: read tmp/scores.data into the global per-rule arrays.
 *
 * Each line is a one-character command followed by its argument:
 *   '.'  selects the rule that the following lines describe
 *   'b'  bestscores entry
 *   'l'  range_lo entry
 *   'h'  range_hi entry
 *   'n'  rule name (copied with strdup and never freed)
 *   'm'  is_mutable flag
 * Lines starting with any other character are ignored.
 *
 * Exits with status 1 if the data file cannot be opened.
 */
void loadscores (void) {
  FILE *fin = fopen ("tmp/scores.data", "r");
  char buf[256];
  int snum = 0;

  /* Without this check a missing file would hand NULL to fgets(). */
  if (fin == NULL) {
    perror ("tmp/scores.data");
    exit (1);
  }

  while (fgets (buf, sizeof (buf), fin) != NULL) {
    char cmd;
    long arg;
    float argd;
    char *str, *white;

    cmd = (char) *buf;
    /* Parse the argument both ways; each command picks the one it needs. */
    arg = strtol (buf+1, NULL, 10);
    argd = (float)strtod (buf+1, NULL);

    /* Cut the line at its newline so the name is stored without it. */
    str = buf+1;
    while ((white = strchr (str, '\n')) != NULL) {
      *white = '\0';
    }

    if (cmd == '.') {
      snum = arg;
    } else if (cmd == 'b') {
      bestscores[snum] = argd;
    } else if (cmd == 'l') {
      range_lo[snum] = argd;
    } else if (cmd == 'h') {
      range_hi[snum] = argd;
    } else if (cmd == 'n') {
      score_names[snum] = strdup (str); /* leaky leak ;) */
    } else if (cmd == 'm') {
      is_mutable[snum] = arg;
    }
  }
  fclose(fin);

  printf ("Read scores for %d tests.\n", num_scores);
}