score-ranges-from-freqs [plain text]
# Tunables for the two ratio-to-range mappings defined at the bottom of this
# file.  In the code visible here they are only reached through the '-test'
# mode; the ranges written to tmp/ranges.data are computed from each rule's
# ranking instead.
#
# Sliding window (sliding_window_ratio_to_range):
#   lo = -limits + size * ratio
#   hi = +limits - size * (1 - ratio)
# so lo is -limits at ratio 0 and hi is +limits at ratio 1.
my $sliding_window_limits = 4.8; my $sliding_window_size = 5.5;
# Shrinking window (shrinking_window_ratio_to_range), where "adjusted" is the
# distance of the ratio from 0.5, doubled:
#   lower edge = lower_base + lower_range * adjusted
#   width      = size_base  + size_range  * adjusted
my $shrinking_window_lower_base = 0.00;
my $shrinking_window_lower_range = 1.00; my $shrinking_window_size_base = 1.00;
my $shrinking_window_size_range = 1.00;
# Selects which mapping the '-test' mode prints: true = sliding window,
# false = shrinking window.
my $use_sliding_window = 0;
# Positional arguments: the first is either the literal '-test' or the path
# later handed to parse-rules-for-masses via -d; the second is the scoreset
# number (0 when omitted).  Any remaining arguments stay in @ARGV and are
# read by the <> loop further down.
my ($argcffile, $scoreset) = splice(@ARGV, 0, 2);
$scoreset = 0 unless defined $scoreset;

# '-test' mode: print the range produced by the selected mapping for a few
# sample ratios on STDERR, then stop without touching any files.
if (defined $argcffile and $argcffile eq '-test') {
    my $ratio_to_range = $use_sliding_window
        ? \&sliding_window_ratio_to_range
        : \&shrinking_window_ratio_to_range;

    foreach my $rat (0.0, 0.25, 0.5, 0.75, 1.0) {
        my ($lo, $hi) = $ratio_to_range->($rat);
        warn "test: $rat => [ $lo $hi ]\n";
    }
    exit;
}
# Per-rule data collected from the frequency listing, keyed by rule name.
my %freq_spam = ();        # spam column of the listing
my %freq_nonspam = ();     # nonspam column of the listing

# Totals taken from the "(all messages)" row.
# NOTE(review): assigned below but never read in the code visible here.
my $num_spam;
my $num_nonspam;
my $num_total;

my %mutable_tests = ();    # 1 if the rule's score may be tuned, else 0
my %ranking = ();          # rank column (forced to 0.0 for very rare rules)
my %soratio = ();          # S/O column (forced to 0.5 for very rare rules)
my %is_nice = ();          # 1 if the rule's tflags contain "nice", else 0

$argcffile = "../rules" unless defined $argcffile;

# Regenerate tmp/rules.pl for the requested scoreset.  The list form of
# system() runs the helper directly instead of through a shell, so quotes,
# spaces or shell metacharacters in the command-line arguments are passed
# through literally rather than being interpreted.
system('./parse-rules-for-masses', '-d', $argcffile, '-s', $scoreset) == 0
    or die "./parse-rules-for-masses -d $argcffile -s $scoreset failed"
         . " (wait status $?)\n";

# tmp/rules.pl is expected to populate %rules, which the rest of this script
# reads but never assigns itself.
-e "tmp/rules.pl"
    or die "parse-rules-for-masses had no error but no tmp/rules.pl!?!";
require "./tmp/rules.pl";
# Read the frequency listing (files named in @ARGV, or STDIN) and record the
# statistics of every rule that still exists in %rules.  Lines that do not
# look like a data row are skipped silently.
while (<>) {
    # Columns: overall, spam, nonspam, S/O ratio, rank, one ignored field,
    # then the test name running to the end of the line.
    #
    # The name capture is non-greedy on purpose: a greedy (.+) swallows any
    # trailing blanks (or the CR of a CRLF line ending) before \s* gets a
    # chance to strip them, and the polluted name then fails the %rules
    # lookup below.
    /^\s*([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+([\d\.]+)\s+\S+\s+(.+?)\s*$/ or next;

    my $overall = $1 + 0;
    my $spam    = $2 + 0;
    my $nonspam = $3 + 0;
    my $soratio = $4 + 0;
    my $ranking = $5 + 0;
    my $test    = $6;

    # Summary rows: keep the absolute totals, ignore the percentage row.
    if ($test eq '(all messages)') {
        $num_spam    = $spam;
        $num_nonspam = $nonspam;
        $num_total   = $spam + $nonspam;
        next;
    }
    next if ($test eq '(all messages as %)');

    if (!defined ($rules{$test})) {
        warn "rule $test no longer exists; ignoring\n";
        next;
    }

    $freq{$test}         = $overall;
    $freq_spam{$test}    = $spam;
    $freq_nonspam{$test} = $nonspam;

    my $tflags = $rules{$test}->{tflags} || '';

    # A rule's score is frozen when it is user-configured, when it is a "net"
    # rule and the scoreset number is even, or when it is a "learn" rule and
    # the scoreset number is odd.
    # NOTE(review): the "learn" test keys off scoreset parity just like the
    # "net" test -- confirm this matches the intended scoreset numbering.
    my $frozen_by_flags =
           $tflags =~ /\buserconf\b/
        || ( ($scoreset % 2) == 0 && $tflags =~ /\bnet\b/ )
        || ( ($scoreset % 2) == 1 && $tflags =~ /\blearn\b/ );

    # The rule's own "mutable" attribute can also freeze it.
    $mutable_tests{$test} =
        ($frozen_by_flags || !$rules{$test}->{mutable}) ? 0 : 1;

    $is_nice{$test} = ($tflags =~ m/\bnice\b/i) ? 1 : 0;

    if ($overall < 0.01) {
        # Too few hits to say anything useful: freeze the rule, give it
        # neutral statistics and zero its score.
        $mutable_tests{$test}  = 0;
        $soratio{$test}        = 0.5;
        $ranking{$test}        = 0.0;
        $rules{$test}->{score} = 0;
    }
    else {
        $soratio{$test} = $soratio;
        $ranking{$test} = $ranking;
    }
}
# Write tmp/ranges.data: one line per rule, "<lo> <hi> <mutable> <name>",
# highest-ranked rules first.

# tmp/rules.pl has already been loaded by this point, so tmp/ normally exists
# and an unconditional mkdir would warn "File exists" on every run.  Only try
# to create the directory when it is actually missing.
unless (-d "tmp") {
    mkdir("tmp", 0755) or warn "Couldn't create tmp directory!: $!\n";
}

open(my $out, '>', "tmp/ranges.data")
    or die "Cannot open tmp/ranges.data for writing: $!\n";

# Equal rankings are ordered by rule name so that the output is reproducible
# from run to run instead of following hash order.
foreach my $test (sort { $ranking{$b} <=> $ranking{$a} || $a cmp $b } keys %freq) {
    if (!defined ($rules{$test})) {
        warn "no rule $test";
        print $out "0 0 0 $test\n";
        next;
    }

    my $mutable = $mutable_tests{$test};
    my $score   = $rules{$test}->{score};

    # Frozen or zero-scored rules get a degenerate range pinned to their
    # current score and are flagged as not mutable.
    if (!$mutable || $score == 0) {
        printf $out "%3.3f %3.3f 0 %s\n", $score, $score, $test;
        next;
    }

    # Mutable rules may range between 0 and 4.5 times their ranking; "nice"
    # rules get the mirrored, non-positive range.
    my $rank = $ranking{$test};
    my ($lo, $hi) = $is_nice{$test}
        ? ($rank * -4.5, 0)
        : (0, $rank * 4.5);

    # The rule name is passed as an argument rather than interpolated into
    # the format so that a stray '%' in it cannot be read as a conversion.
    printf $out "%3.1f %3.1f %s %s\n", $lo, $hi, $mutable, $test;
}

# Buffered write errors only surface at close time.
close($out) or die "Error writing tmp/ranges.data: $!\n";
exit;
# Map a ratio to a (lo, hi) score range using a window that slides with the
# ratio.  Uses the file-level $sliding_window_limits and $sliding_window_size.
#
#   lo = -limits + size * ratio
#   hi = +limits - size * (1 - ratio)
#
# Returns the two bounds as a list, smaller value first.
sub sliding_window_ratio_to_range {
    my ($ratio) = @_;

    my $shift_up   = $sliding_window_size * $ratio;
    my $shift_down = $sliding_window_size * (1 - $ratio);

    my $lo = -$sliding_window_limits + $shift_up;
    my $hi = +$sliding_window_limits - $shift_down;

    # Hand the bounds back in ascending order.
    return $lo > $hi ? ($hi, $lo) : ($lo, $hi);
}
# Map a ratio to a (lo, hi) score range whose lower edge and width both grow
# with the ratio's distance from 0.5.  Uses the file-level
# $shrinking_window_lower_base / _lower_range / _size_base / _size_range.
#
# Ratios below 0.5 are treated as "nice": the range is computed from the
# mirrored distance and then negated, so it lies on the opposite side of zero.
#
# Returns the two bounds as a list, smaller value first.
sub shrinking_window_ratio_to_range {
    my ($ratio) = @_;

    # Signed distance from the midpoint, scaled by two.
    my $adjusted = ($ratio - 0.5) * 2;
    my $is_nice  = ($adjusted < 0) ? 1 : 0;
    $adjusted = -$adjusted if $is_nice;

    my $lower = $shrinking_window_lower_base
              + ($shrinking_window_lower_range * $adjusted);
    my $range = $shrinking_window_size_base
              + ($shrinking_window_size_range * $adjusted);

    my ($lo, $hi) = ($lower, $lower + $range);

    # Mirror the window around zero for the "nice" side.
    ($lo, $hi) = (-$hi, -$lo) if $is_nice;

    # Hand the bounds back in ascending order.
    return $lo > $hi ? ($hi, $lo) : ($lo, $hi);
}