use Statistics::Distributions;
use strict;
my $alpha = 0.95; my $lambda = 50;
if ( scalar(@ARGV) < 2 ) {
print STDERR "Usage: compare-models [validate1] [validate2]\n";
exit 1;
}
my (@fp1, @fn1, @tcr1);
open (FILE, $ARGV[0]) || die $!;
while (<FILE>) {
my @x = split(/\s+/);
push (@fp1, $x[2] / ($x[0] + $x[2]));
push (@fn1, $x[3] / ($x[1] + $x[3]));
push (@tcr1, $x[1] / ($x[3] + $lambda * $x[2]));
}
close (FILE);
my (@fp2, @fn2, @tcr2);
open (FILE, $ARGV[1]) || die $!;
while (<FILE>) {
my @x = split(/\s+/);
push (@fp2, $x[2] / ($x[0] + $x[2]));
push (@fn2, $x[3] / ($x[1] + $x[3]));
push (@tcr2, $x[1] / ($x[3] + $lambda * $x[2]));
}
close (FILE);
stat_analysis ("False positives", "pct", \@fp1, \@fp2);
stat_analysis ("False negatives", "pct", \@fn1, \@fn2);
stat_analysis ("TCR (lambda=$lambda)", "lin", \@tcr1, \@tcr2);
sub stat_analysis {
my $title = shift;
my $pct = shift;
my $s1 = shift;
my $s2 = shift;
unless ( scalar(@$s1) == scalar(@$s1) ) {
print STDERR "Can't compute stats for $title. Samples are not paired.\n";
return;
}
my $dof = scalar(@{$s1});
print "$title:\n";
my $mean_s1 = 0;
foreach my $i (1..$dof) {
$mean_s1 += $$s1[$i];
}
$mean_s1 /= $dof;
my $var_s1 = 0;
foreach my $i (1..$dof) {
$var_s1 += ($mean_s1 - $$s1[$i])**2;
}
$var_s1 /= $dof - 1;
my $std_s1 = sqrt($var_s1);
my $mean_s2 = 0;
foreach my $i (1..$dof) {
$mean_s2 += $$s2[$i];
}
$mean_s2 /= $dof;
my $var_s2 = 0;
foreach my $i (1..$dof) {
$var_s2 += ($mean_s2 - $$s2[$i])**2;
}
$var_s2 /= $dof - 1;
my $std_s2 = sqrt($var_s2);
if ( $pct eq "pct" ) {
printf "\tSample 1: mean=%0.4f%% std=%0.4f\n",100*$mean_s1,100*$std_s1;
printf "\tSample 2: mean=%0.4f%% std=%0.4f\n",100*$mean_s2,100*$std_s2;
} else {
printf "\tSample 1: mean=%0.4f std=%0.4f\n",$mean_s1,$std_s1;
printf "\tSample 2: mean=%0.4f std=%0.4f\n",$mean_s2,$std_s2;
}
my $mean_d = 0;
foreach my $i (1..$dof) {
$mean_d += $$s1[$i] - $$s2[$i];
}
$mean_d /= $dof;
my $var_d = 0;
foreach my $i (1..$dof) {
$var_d += ($mean_d - $$s1[$i] + $$s2[$i])**2;
}
$var_d /= $dof - 1;
my $std_d = sqrt($var_d);
my $tstat;
if ( $var_d > 0 ) {
$tstat = $mean_d / sqrt($var_d / $dof);
} else {
$tstat = 0;
}
my $tcrit = Statistics::Distributions::tdistr ($dof, (1-$alpha)/2);
my $tprob = 1-Statistics::Distributions::tprob ($dof, abs($tstat))/2;
if ( abs($tstat) < $tcrit ) {
printf "\tNot statistically significantly different (alpha=%0.4f)\n", $alpha;
} else {
printf "\tStatistically significantly different with confidence %0.4f%%\n", 100*$tprob;
}
if ( $pct eq "pct" ) {
printf "\tEstimated difference: %0.4f%% +/- %0.4f\n", 100*$mean_d, 100*$std_d*$tcrit;
} else {
printf "\tEstimated difference: %0.4f +/- %0.4f\n", $mean_d, $std_d*$tcrit;
}
print "\n";
}