# true-false-pos-neg-filter.pl   [plain text]


#!/usr/bin/perl -w

use strict;
use warnings;
use vars qw(%scores);

# Configuration.
my $cffile    = "craig-evolve.scores";   # score file handed to parse-rules-for-masses
my $threshold = 5;                       # a summed score >= this counts as "identified as spam"

# Per-message state, keyed by a running message index assigned in readlogs():
#   %lines   - the raw log line
#   %is_spam - 1 if the line came from spam.log, 0 otherwise
#   %id_spam - 1 if the summed test scores reached $threshold, 0 otherwise
my (%is_spam, %id_spam, %lines);

# Run the three phases in order; each one is announced first (no newline,
# the phase itself prints its result after the label).
for my $step (
    [ "Reading scores...",   \&readscores   ],
    [ "Reading logs...",     \&readlogs     ],
    [ "Sorting messages...", \&sortmessages ],
) {
    my ($label, $phase) = @$step;
    print $label;
    $phase->();
}

# Split every message recorded in %lines into four log files according to
# its real class (%is_spam) versus the class assigned from its score
# (%id_spam), then print the four totals.
#
# Files created (or truncated) in the current directory:
#   truepos.log   - spam,     identified as spam
#   trueneg.log   - non-spam, identified as non-spam
#   falseneg.log  - spam,     identified as non-spam
#   falsepos.log  - non-spam, identified as spam
#
# Prints "truepos,trueneg,falseneg,falsepos" counts on one line to STDOUT.
# Dies if any of the output files cannot be opened or cleanly closed.
sub sortmessages {
    my ($yy,$nn,$yn,$ny) = (0,0,0,0);

    # Three-arg open with lexical handles; a failure here used to be
    # ignored, which silently discarded the corresponding category.
    open(my $yy_fh, '>', 'truepos.log')
        or die "cannot open truepos.log for writing: $!\n";
    open(my $nn_fh, '>', 'trueneg.log')
        or die "cannot open trueneg.log for writing: $!\n";
    open(my $yn_fh, '>', 'falseneg.log')
        or die "cannot open falseneg.log for writing: $!\n";
    open(my $ny_fh, '>', 'falsepos.log')
        or die "cannot open falsepos.log for writing: $!\n";

    # Message indices are the consecutive integers 0 .. N-1 handed out by
    # readlogs(), so iterating numerically preserves the input order.
    for my $count (0 .. scalar(keys %lines) - 1) {
        if ($is_spam{$count}) {
            if ($id_spam{$count}) {
                print {$yy_fh} $lines{$count};
                $yy++;
            }
            else {
                print {$yn_fh} $lines{$count};
                $yn++;
            }
        }
        else {
            if ($id_spam{$count}) {
                print {$ny_fh} $lines{$count};
                $ny++;
            }
            else {
                print {$nn_fh} $lines{$count};
                $nn++;
            }
        }
    }

    print "$yy,$nn,$yn,$ny\n";

    # Buffered write errors (e.g. disk full) only surface at close time.
    close $yy_fh or die "error closing truepos.log: $!\n";
    close $ny_fh or die "error closing falsepos.log: $!\n";
    close $yn_fh or die "error closing falseneg.log: $!\n";
    close $nn_fh or die "error closing trueneg.log: $!\n";
}

# Read "spam.log" and "nonspam.log" from the current directory and record
# one entry per usable line in %lines, %is_spam and %id_spam, all keyed by
# a running index starting at 0.
#
# Lines starting with '#' are skipped.  A usable line must look like
#   <one char> <whitespace> <digits> <whitespace> <non-space token> <tests>
# where <tests> is a comma-separated list of test names; lines that do not
# match are skipped.  Each test's score is looked up in %scores and summed;
# unknown test names produce a warning and contribute nothing.
#
# For every usable line:
#   $lines{$i}   = the raw line as read
#   $id_spam{$i} = 1 if the summed score is >= $threshold, else 0
#   $is_spam{$i} = 1 if the line came from spam.log, else 0
#
# Prints the number of recorded lines followed by a newline.
# Dies if either log file cannot be opened, since continuing would
# silently produce statistics for a partial data set.
sub readlogs {
    my $count = 0;

    foreach my $file ("spam.log", "nonspam.log") {
        open(my $in, '<', $file)
            or die "cannot open $file for reading: $!\n";

        while (my $this_line = <$in>) {
            next if $this_line =~ /^#/;

            # The digits field is required for a match but its value is
            # not used here; only the text after the third field matters.
            $this_line =~ /^.\s+\d+\s+\S+\s*(.*)/ or next;
            my $tests = $1;

            # Collapse runs of commas and trim surrounding whitespace.
            $tests =~ s/,,+/,/g;
            $tests =~ s/^\s+//;
            $tests =~ s/\s+$//;

            my $msg_score = 0;
            foreach my $tst (split (/,/, $tests)) {
                next if ($tst eq '');
                if (!defined $scores{$tst}) {
                    warn "unknown test in $file, ignored: $tst\n";
                    next;
                }
                $msg_score += $scores{$tst};
            }

            $lines{$count}   = $this_line;
            $id_spam{$count} = ($msg_score >= $threshold) ? 1 : 0;
            $is_spam{$count} = ($file eq "spam.log")      ? 1 : 0;
            $count++;
        }
        close $in;
    }
    print "$count\n";
}

# Run ./parse-rules-for-masses over ../rules and the score file named by
# $cffile, then load ./tmp/rules.pl and print how many entries %scores
# holds, followed by a newline.
#
# NOTE(review): ./tmp/rules.pl is presumably written by
# parse-rules-for-masses and is expected to fill %scores - confirm against
# that script.
#
# Dies with the command line and wait status if the helper fails; require
# itself dies if ./tmp/rules.pl cannot be loaded.
sub readscores {
    # List-form system() bypasses the shell, so a $cffile containing
    # quotes or other metacharacters is passed through verbatim.
    my @cmd = ('./parse-rules-for-masses', '-d', '../rules', '-d', $cffile);
    system(@cmd) == 0
        or die "command '@cmd' failed (wait status $?)\n";

    require "./tmp/rules.pl";
    print scalar(keys %scores),"\n";
}