freqdiff   [plain text]


#!/usr/bin/perl -w
#
# freqdiff - print frequency difference between two inputs
#
# <@LICENSE>
# Copyright 2004 Apache Software Foundation
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

use vars qw($opt_a $opt_b $opt_c $opt_d $opt_h $opt_p $opt_r $opt_l);
use Getopt::Std;
# Parse the single-letter switches into the $opt_* package variables;
# only -l takes an argument.
getopts('abcdhprl:');

# Script name with any leading directory stripped, for usage and error
# messages.
(my $prog = $0) =~ s@.*/@@;

# usage(STATUS)
#
# Print the help text and terminate the program with exit code STATUS.
# A zero STATUS (help was requested) writes to STDOUT; any other STATUS
# (bad invocation) writes to STDERR.
sub usage {
    my ($status) = @_;

    my $fh = $status ? \*STDERR : \*STDOUT;
    print {$fh} <<EOF;
usage: $prog [options] [first input] [second input]
       $prog [options] [first input] [second input] [scores file]

 -a    show all elements (only used for -d)
 -b    show both count and percentage
 -c    remove comments from inputs
 -d    print delta of frequencies (second - first)
 -h    print this help
 -p    print percentage of frequencies ((first / (first + second)) * 100.0)
 -r    use relative frequencies (account for size of inputs, useful for -p)
 -l N  only read first N lines

default is -d if input line counts are within 1% of each other and -p otherwise
EOF
    exit($status);
}

# Main program: read both inputs, pick the output mode, compute one value
# per element (delta or percentage) and print the elements sorted by
# descending value.

usage(0) if $opt_h;
usage(1) unless $#ARGV > 0;     # at least two inputs are required

# Both variables are filled in by read_argv():
#   @line - number of counted lines for input 0 and input 1
#   $type - detected input format (1 = "sort | uniq -c", 2 = mass-check,
#           3 = scores, 4 = line number is frequency)
my @line;
my $type = 0;                   # 0 = nothing parsed yet; keeps the numeric
                                # comparisons below free of undef warnings
my %one = read_argv(0);
my %two = read_argv(1);

# Default to -d when the two line counts are within 1% of each other.
# An empty second input must not trigger a division by zero; in that case
# the counts only count as "close" when the first input is empty as well.
if (!$opt_p) {
    my $gap = abs($line[0] - $line[1]);
    my $close = $line[1] ? ($gap / $line[1] < 0.01) : ($gap == 0);
    $opt_d = 1 if $close;
}
$opt_d = 1 if ($type == 3);     # scores are always compared as deltas
$opt_a = 1 if ($type == 4);     # rank inputs always show every element

# Optional third argument: a scores file whose "score NAME VALUE" lines
# are used to prefix each output name with its score.
my %score;
if ($#ARGV > 1) {
    # Two-argument open is kept so the argument is interpreted exactly as
    # before; the failure reason is now included in the message.
    open(my $scores_fh, $ARGV[2]) || die "open failed: $ARGV[2]: $!";
    while (<$scores_fh>) {
        chomp;
        s/#.*//;
        my @field = split;
        if ($#field >= 2 && $field[0] eq "score") {
            $score{$field[1]} = $field[2];
        }
    }
    close($scores_fh);
}

# Union of the element names of both inputs.
my @all = (keys %one);
foreach my $elem (keys %two) {
    push(@all, $elem) unless exists($one{$elem});
}

my %out;        # element => delta or percentage
my %count;      # element => combined count (only filled for -b without -d)
foreach my $elem (@all) {
    # An element missing from one input counts as 0, except for scores
    # files where the value used for a missing score is 1.0.
    my $missing = ($type == 3) ? 1.0 : 0;
    my $one = exists($one{$elem}) ? $one{$elem} : $missing;
    my $two = exists($two{$elem}) ? $two{$elem} : $missing;

    if ($opt_d) {
        $out{$elem} = $two - $one;
    }
    else {
        my $total = $one + $two;
        $count{$elem} = $total if $opt_b;
        # A zero total has no meaningful percentage; report 0 instead of
        # dying with "Illegal division by zero".
        $out{$elem} = $total ? $one / $total * 100.0 : 0;
    }
}

# Sort by value (descending), then by combined count (descending) when
# counts were collected, then by name.  Testing %count directly replaces
# defined(%count), which is a fatal error on Perl 5.22 and later.
my @sorted = sort {
    $out{$b} <=> $out{$a}
        || (%count ? $count{$b} <=> $count{$a} : 0)
        || $a cmp $b
} keys %out;

foreach my $elem (@sorted) {
    my $name = $elem;
    if (%score) {
        # Prefix with the element's score; names without one get 1.0.
        if (exists($score{$elem})) {
            $name = "$score{$elem}\t$name";
        }
        else {
            $name = "1.0\t$name";
        }
    }
    if ($type == 3) {
        printf "%.3f\t%s\n", $out{$elem}, $name;
    }
    elsif ($opt_d) {
        # Zero deltas are only shown with -a.
        print "$out{$elem}\t$name\n" if ($out{$elem} || $opt_a);
    }
    else {
        if ($opt_b) {
            printf "%.2f\t%d\t%s\n", $out{$elem}, $count{$elem}, $name;
        }
        else {
            printf "%.2f\t%s\n", $out{$elem}, $name;
        }
    }
}

# read_argv(INDEX)
#
# Parse the input named by $ARGV[INDEX] and return a hash (as a list)
# mapping each element to its frequency.
#
# Every counted line must be in the same one of four formats; the format
# number is stored in the file-level $type:
#   1  "sort | uniq -c"   "<number> <element>"
#   2  mass-check         "Y|. <number> <field> <elem,elem,...>"
#   3  scores             "score <element> <value>"
#   4  anything else      the whole line is the element and its line
#                         number determines the frequency
#
# Options honoured: -c strips "#" comments and skips lines left blank,
# -l N stops after N counted lines, -r divides every frequency by the
# number of counted lines.
#
# Side effects: sets $type, stores the number of counted lines in
# $line[INDEX], and dies if the input cannot be opened or mixes formats.
# Because $type is carried over between calls, a second input in a
# different format than the first is reported as inconsistent too.
sub read_argv {
    my ($input) = @_;
    my %freq;
    my $last = 0;

    # Two-argument open is kept so the argument is interpreted exactly as
    # before; the failure reason is now included in the message.
    open(my $fh, $ARGV[$input]) || die "open failed: $ARGV[$input]: $!";
    my $line = 0;
    while (<$fh>) {
        if ($opt_c) {
            s/#.*//;
            next unless /\S/;
        }
        next if (/^OVERALL/);   # hit-frequencies header line

        # -l N: stop once N lines have been counted.  Stopping (instead of
        # skipping while still counting) keeps $line equal to the number of
        # lines actually used, which -r, the format-4 ranks and the
        # caller's line-count comparison all depend on.
        last if ($opt_l && $line >= $opt_l);
        $line++;

        $last = $type;
        # "sort | uniq -c" format
        if (/^\s*(-?\d+|-?\d+\.\d+)\s+(.*)/) {
            $type = 1;
            $freq{$2} = $1;
        }
        # "mass-check" format
        elsif (/^[Y.]\s+-?\d+\s+\S+\s+(\S+)/) {
            $type = 2;
            foreach (split(/,/, $1)) {
                $freq{$_}++;
            }
        }
        # "scores" format
        elsif (/^score\s+(\S+)\s+(-?[\d.]+)/) {
            $type = 3;
            $freq{$1} = $2;
        }
        # line number is frequency
        else {
            $type = 4;
            chomp;
            $freq{$_} = $line;
        }
        if ($last && $last != $type) {
            die "$prog: inconsistent format in $ARGV[$input] (format $last then format $type)\n";
        }
    }
    close($fh);

    foreach my $key (keys %freq) {
        if ($type == 4) {
            # Turn the line number into a rank: the first line gets the
            # highest value, the last counted line gets 1.
            $freq{$key} = $line - $freq{$key} + 1;
        }
        if ($opt_r) {
            $freq{$key} /= $line;
        }
    }
    $line[$input] = $line;
    return %freq;
}