overlap [plain text]

#!/usr/bin/perl -w

# overlap - print overlap between test pairs
#
# <@LICENSE>
# Copyright 2004 Apache Software Foundation
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

use vars qw($opt_a $opt_h $opt_t);
use Getopt::Std;

# Program name without its directory part; interpolated into the usage text
# and into error messages.  It is computed before option parsing so that
# usage() can show it when a bad option is given.
my $prog = $0;
$prog =~ s@.*/@@;

# Getopt::Std stores the -a, -h and -t switches in $opt_a, $opt_h and $opt_t.
# getopts() returns false when it meets an option it does not know; treat
# that as a usage error instead of silently carrying on.
getopts("aht") || usage(1);

# usage($status)
#
# Print the help text and terminate the program.
#
#   $status - exit code handed to exit(); a true value sends the text to
#             STDERR, a false value sends it to STDOUT.
#
# Never returns.
sub usage {
    my ($status) = @_;

    # Select the handle as a glob reference rather than a bareword string,
    # so the print below does not depend on symbolic references.
    my $out = $status ? \*STDERR : \*STDOUT;
    print {$out} <<EOF;
usage: $prog [options] [mass-check results files]

 -a    show all entries (normally, reverses of pairs are not shown)
 -h    print this help
 -t    ignore T_ tests

Do not abuse this tool.  Just because a test highly correlates with
another test does not mean you can simply remove one or merge them
without further consideration.  You need to also look at hit rates,
false positives, false negatives, and actually compare the tests.
Some overlap is often good, especially if the tests have different
characteristics.

EOF
    exit($status);
}

# -h: print the help text on standard output and exit with status 0.
usage(0) if $opt_h;

# With no file arguments, fall back to the single input name "-".
@ARGV = ("-") unless @ARGV;

# Counters filled in by read_file():
#   $solo{TEST}  - incremented once for every TEST listed on a result line
#   $pair{"A B"} - incremented once for every pair A, B listed together
my %solo;
my %pair;

for my $file (@ARGV) {
    read_file($file);
}

print "COUNT\tPAIR/A\tPAIR/B\tA,B\n";

# Every output row has the same layout: pair count, the pair count divided
# by each test's own count, and the two test names.
my $row_fmt = "%d\t%.3f\t%.3f\t%s,%s\n";

# Highest pair count first.  Equal counts are ordered by pair name so the
# output order does not depend on the hash's internal key order.
my @ordered_keys = sort { $pair{$b} <=> $pair{$a} || $a cmp $b } keys %pair;

for my $key (@ordered_keys) {
    # Lexicals are deliberately not called $a/$b: those names belong to sort.
    my ($first, $second) = split(/ /, $key);
    my $count = $pair{$key};
    my $first_pct = $count / $solo{$first};
    my $second_pct = $count / $solo{$second};

    if ($opt_a) {
	# -a: show the pair in both directions.
	printf($row_fmt, $count, $first_pct, $second_pct, $first, $second);
	printf($row_fmt, $count, $second_pct, $first_pct, $second, $first);
    }
    elsif (($first_pct > $second_pct)
	   || ($first_pct == $second_pct && $first lt $second)) {
	# Show one direction only: the test with the larger ratio leads,
	# and on equal ratios the name that sorts first leads.
	printf($row_fmt, $count, $first_pct, $second_pct, $first, $second);
    }
    else {
	printf($row_fmt, $count, $second_pct, $first_pct, $second, $first);
    }
}

# read_file($input)
#
# Read one results file and add its hits to the file-level %solo and %pair
# counters.
#
#   $input - name of the file to read, or "-" for standard input
#
# Lines beginning with "#" are skipped.  Every other line must start with
# "Y" or ".", followed by an integer, one whitespace-free field and then a
# comma-separated list of test names (presumably the mass-check result,
# score and message id columns -- confirm against the mass-check format).
# For each such line $solo{TEST} is incremented for every listed test and
# $pair{"A B"} for every pair of listed tests.  When $opt_t is set, tests
# whose names begin with "T_" are dropped before counting.
#
# Dies if the file cannot be opened or a line does not have that format.
sub read_file {
    my ($input) = @_;

    my $fh;
    if ($input eq '-') {
	# Three-argument open would look for a file literally named "-",
	# so map the conventional "-" to standard input explicitly.
	$fh = \*STDIN;
    }
    else {
	# Three-argument open: the name is never interpreted as a mode or
	# a pipe, whatever characters it contains.
	open($fh, '<', $input) || die "open failed: $input: $!";
    }

    while (my $line = <$fh>) {
	next if $line =~ /^#/;
	if ($line =~ /^[Y.]\s+-?\d+\s+\S+\s+(\S+)/) {
	    my @tests = split(/,/, $1);
	    @tests = grep { !/^T_/ } @tests if $opt_t;
	    # Sort so a pair always gets the same "A B" key, no matter in
	    # which order the two tests appear on the line.
	    @tests = sort @tests;
	    for my $i (0 .. $#tests) {
		my $first = $tests[$i];
		$solo{$first}++;
		# Pair this test with each test after it; the slice is
		# empty for the last test.
		$pair{"$first $_"}++ for @tests[$i + 1 .. $#tests];
	    }
	}
	else {
	    die "$prog: error in input format in $input\n";
	}
    }
    close($fh) unless $input eq '-';
}