split-log-into-buckets-random   [plain text]


#!/usr/bin/perl
#
# split-log-into-buckets [n]
#
# Split a mass-check log read from standard input into n buckets,
# written to split-1.log .. split-n.log in the current directory.
# Every input line (comment lines included) is sent to one bucket
# chosen at random, so messages from all checked corpora are spread
# across the buckets and the bucket sizes are only approximately
# equal.  n defaults to 10

use strict;
use warnings;

# Number of buckets, taken from the first command-line argument.  The
# argument is truncated to a whole number so that the bucket indices drawn
# below always correspond to a file that was actually opened.
my $numbuckets = 0;
if (defined $ARGV[0]) {
  no warnings 'numeric';   # loosely numeric arguments are still accepted
  $numbuckets = int($ARGV[0] + 0);
}
# Missing, zero, negative or non-numeric counts all fall back to the default.
$numbuckets = 10 unless $numbuckets >= 1;

# Open one output file per bucket, keyed by bucket number (1 .. n).
my %buckets = ();
foreach my $i (1 .. $numbuckets) {
  my $file = "split-$i.log";
  print "Creating $file\n";
  open ($buckets{$i}, '>', $file)
    or die "cannot create $file: $!\n";
}

# Send each input line to a uniformly chosen bucket.  Printing to an
# explicit handle leaves STDOUT as the default output handle.
while (my $line = <STDIN>) {
  my $pick = 1 + int(rand($numbuckets));
  print { $buckets{$pick} } $line
    or die "cannot write to split-$pick.log: $!\n";
}

# Buffered write errors only surface at close time, so check it.
foreach my $i (1 .. $numbuckets) {
  close $buckets{$i}
    or die "error closing split-$i.log: $!\n";
}