fuzzy-hash-maildir   [plain text]


#!/usr/bin/perl
#
# Uses a fuzzy hash to find closely-related mails, which might only
# differ by an ID string somewhere or similar.  Consider it "Razor Lite" ;)
#
# usage: find-dups corpusdir/* > possible-dups.list
# or:    find-dups corpusdir1 [dir2 dir3 ...] > possible-dups.list

$DUMP_WITH_KEY = 0;	# TODO: use getopt
$COMMANDS = 1;

my $hashes2 = { };
my $hashes3 = { };
my $hashes4 = { };

use Digest::SHA1 qw(sha1_base64);
use File::Find;

my @files = ();
foreach my $file (@ARGV) {
  if (-d $file) {
    find (\&wanted, $file);
    sub wanted {
      (-f $_) and push (@files, $File::Find::name);
    }
  } else {
    push (@files, $file);
  }
}

foreach my $file (@files) {
  open (STDIN, "<$file") or warn "$file cannot be opened";
  my @hash = do_one();
  close STDIN;
  #print "$hash\t$file\n";

  my $hash2 = $hash[0].$hash[1];
  my $hash3 = $hash[0].$hash[1].$hash[2];
  my $hash4 = $hash[0].$hash[1].$hash[2].$hash[3];

  $hashes4->{$hash4} .= " ".$file;
  $hashes3->{$hash3} .= " ".$file;
  $hashes2->{$hash2} .= " ".$file;
}

check_collisions ($hashes4);
check_collisions ($hashes3);
check_collisions ($hashes2);
exit;

sub check_collisions {
  my ($db, $hash, $file) = @_;

  foreach $k (sort keys %{$db}) {
    $_ = $db->{$k};
    next unless (/\S \S/);
    s/^ //g;

    if ($DUMP_WITH_KEY) {
      print "$_  [$k]\n";	# to print the key
    } elsif ($COMMANDS) {
      my $count = 0;
      while (m/ /g) { $count++; }
      /^(\S+) (.*)$/;
      print "echo \"$1 : $count dups\"; rm -f $2\n";
    } else {
      print "$_\n";
    }

    delete $hashes4->{$k};
    delete $hashes3->{$k};
    delete $hashes2->{$k};
  }
}

sub do_one {
  while (<STDIN>) { /^$/ and last; }

  my $str = join ('', <STDIN>);

  # strip HTML tags, email addresses, queries
  # Add more strippings here if you like.
  $str =~ s/<[^>]+?>/ /igs;
  $str =~ s/"[^\"\s]+\?[^\"\s]+\"/ /igs;
  $str =~ s/\S+\?\S+/ /igs;
  $str =~ s/\S+\@\S+/ /igs;

  $str =~ s/TRCK:\S+//;

  $str =~ s/^[a-z0-9]{6,}[-_a-z0-9]{12,}[a-z0-9]{6,}\s*\z//is;
  $str =~ s/^\s*\S{24,}\s*\z//is;

#print $str;

  my @data = split (/\n/, $str);
  my $lpb = ($#data+1) / 4;

#warn "JMD $#lines $lpb";

  my @blks = ();
  push (@blks, join ('', splice (@data, 0, $lpb)));
  push (@blks, join ('', splice (@data, 0, $lpb)));
  push (@blks, join ('', splice (@data, 0, $lpb)));
  push (@blks, join ('', splice (@data, 0, $lpb)));

  my @ret = ();
  foreach my $blk (@blks) {
    #warn "JMD $blk";
    #my $digest = sprintf ("%05d", unpack ("%16C*", $blk));
    my $digest = sha1_base64($blk);
    push (@ret, $digest);
  }
  @ret;
}