package Mail::SpamAssassin::TextCat;
use strict;
use bytes;
use vars qw(
$opt_a $opt_f $opt_t $opt_u
);
my @nm;
$opt_a = 10;
$opt_f = 0;
$opt_t = 400;
$opt_u = 1.05;
sub classify {
my ($self, $inputptr, $languages_filename) = @_;
my %results;
my $maxp = $opt_t;
my @unknown = create_lm($inputptr);
if (! @nm) {
my @lm;
my $ngram = {};
my $rang = 1;
dbg("Loading languages file...");
if (!defined $languages_filename) {
return;
}
open(LM, $languages_filename)
|| die "cannot open languages: $!\n";
local $/ = undef;
@lm = split(/\n/, <LM>);
close(LM);
for (@lm) {
if (/^0 (.+)/) {
$ngram->{"language"} = $1;
push(@nm, $ngram);
$ngram = {};
$rang = 1;
}
else {
$ngram->{$_} = $rang++;
}
}
}
foreach my $ngram (@nm) {
my $language = $ngram->{"language"};
my $i = 0;
my $p = 0;
for (@unknown) {
$p += exists($ngram->{$_}) ? abs($ngram->{$_} - $i) : $maxp;
$i++;
}
$results{$language} = $p;
}
my @results = sort { $results{$a} <=> $results{$b} } keys %results;
my $best = $results{$results[0]};
my @answers=(shift(@results));
while (@results && $results{$results[0]} < ($opt_u * $best)) {
@answers=(@answers, shift(@results));
}
if (@answers > $opt_a) {
dbg("Can't determine language uniquely enough");
return ();
}
else {
dbg("Language possibly: ".join(",",@answers));
return @answers;
}
}
sub create_lm {
my %ngram;
my @sorted;
for my $word (split(/[0-9\s]+/, ${$_[0]}))
{
$word = "\000" . $word . "\000";
my $len = length($word);
my $flen = $len;
my $i;
for ($i = 0; $i < $flen; $i++) {
$len--;
$ngram{substr($word, $i, 1)}++;
($len < 1) ? next : $ngram{substr($word, $i, 2)}++;
($len < 2) ? next : $ngram{substr($word, $i, 3)}++;
($len < 3) ? next : $ngram{substr($word, $i, 4)}++;
if ($len > 3) { $ngram{substr($word, $i, 5)}++ };
}
}
if ($opt_f > 0) {
@sorted = sort { $ngram{$b} <=> $ngram{$a} }
(grep { $ngram{$_} > $opt_f } keys %ngram);
}
else {
@sorted = sort { $ngram{$b} <=> $ngram{$a} } keys %ngram;
}
splice(@sorted, $opt_t) if (@sorted > $opt_t);
return @sorted;
}
sub dbg { Mail::SpamAssassin::dbg (@_); }
1;