run-corpora   [plain text]


#!/bin/sh

#
# This is a small script used to interact with run-masses to do a full
# corpus mass-check run, including the rsync to the SA server.
# Change the appropriate variables below.
#
# By default, it'll do a set0 run, but you can change that by adding
# --net or --bayes to the commandline.
#
# --net by itself will automatically try running 4 mass-checks in parallel
#

# ---------------------------------------------------------------------------
# Site settings -- edit these for the local installation.
# ---------------------------------------------------------------------------

# Directory holding the corpus plus the "update" and "run-masses" helpers.
CORPUS=/home/felicity/SA/corpus
# SpamAssassin working copy that is svn-updated and handed to run-masses.
SA_VER=/home/felicity/SA/spamassassin-corpora
# Subversion client binaries.
SVN=/usr/local/bin/svn
SVNVERS=/usr/local/bin/svnversion

# Credentials for the upload.  The password is exported so that child
# processes can see it in their environment.
RSYNC_USER=your_rsync_username
RSYNC_PASSWORD="your_rsync_password"
export RSYNC_PASSWORD

# ---------------------------------------------------------------------------
# Run defaults; the command line may adjust them below.
# ---------------------------------------------------------------------------
NET=0
BAYES=0
VERS=nightly
FILENAME=$RSYNC_USER
# 2592000 seconds = 30 days.
OPTS="--progress --after=-2592000"

# Scan the arguments: only --net and --bayes mean anything, everything else
# is skipped over.  Scanning stops at the first empty (or missing) argument.
while [ -n "$1" ]; do
  case "$1" in
    --net)   NET=1 ;;
    --bayes) BAYES=1 ;;
  esac
  shift
done

# Turn the two switches into the mass-check options, the log-file tag and
# the name of the versions file to follow.
case "$NET:$BAYES" in
  1:0)
    # Net checks on their own also get the parallel/restart options.
    FILENAME="net-$FILENAME"
    OPTS="$OPTS --net -j 4 --restart 1000"
    VERS=weekly
    ;;
  1:1)
    # Net + Bayes: no extra parallelization.
    FILENAME="bayes-net-$FILENAME"
    OPTS="$OPTS --net --bayes"
    VERS=weekly
    ;;
  0:1)
    FILENAME="bayes-$FILENAME"
    OPTS="$OPTS --bayes"
    ;;
esac

# Update SA version before our run.
#
# Downloads $VERS-versions.txt into $SA_VER, takes the revision from the
# second field of its last line, and runs "svn update -r" to that revision.
# Exits 0 (skipping the run) when the working copy is already at or past
# that revision; exits 2 when a step cannot be completed.

# Run the given command until it succeeds, waiting a minute between tries.
# Gives up and returns 1 after the sixth failed attempt.
retry_cmd() {
  _retry_attempt=1
  until "$@"; do
    [ "$_retry_attempt" -lt 6 ] || return 1
    _retry_attempt=$((_retry_attempt + 1))
    sleep 60
  done
}

echo "[Updating $SA_VER]"
# wget and svn below both act on the current directory, so don't carry on
# from wherever we happened to start if the cd fails.
if ! cd "$SA_VER"; then
  echo "Couldn't cd to $SA_VER, aborting!" >&2
  exit 2
fi

if ! retry_cmd wget -q -nd -m "http://rsync.spamassassin.org/$VERS-versions.txt"; then
  echo "Couldn't get the $VERS revision version, aborting!" >&2
  exit 2
fi

# svnversion can print more than a bare number: "4123:4168" for a
# mixed-revision working copy, trailing M/S/P flags, or a non-numeric word
# such as "exported" for an unversioned directory.  Keep only the leading
# digits (the lowest revision present) so the numeric test below is
# well-formed and the update is skipped only when the whole working copy is
# recent enough.  No digits at all counts as revision 0.
CREV=$("$SVNVERS" .)
CREV=${CREV%%[!0-9]*}
[ -n "$CREV" ] || CREV=0

NREV=$(tail -n 1 "$VERS-versions.txt" | awk '{print $2}')
case "$NREV" in
  '' | *[!0-9]*)
    # Without a usable target revision neither the comparison nor
    # "svn update -r" can work, so stop here instead of retrying for minutes.
    echo "Couldn't read a revision number from $VERS-versions.txt, aborting!" >&2
    exit 2
    ;;
esac

if [ "$CREV" -ge "$NREV" ]; then
  echo "Current rev ($CREV) newer or equal to $VERS rev ($NREV)"
  exit 0
fi

if ! retry_cmd "$SVN" update -r "$NREV"; then
  echo "Couldn't do a SVN update, aborting!" >&2
  exit 2
fi

# update the corpus with the latest/greatest mail files
echo "[Updating Corpus]"
# Everything from here on uses paths relative to $CORPUS, so a failed cd
# must stop the script rather than let it continue somewhere else.
if ! cd "$CORPUS"; then
  echo "Couldn't cd to $CORPUS, aborting!" >&2
  exit 2
fi
"$CORPUS/update" -q

# remove current bayes db set
echo "[Removing old Bayes DB]"
# ${SA_VER:?} aborts the script if SA_VER is unset or empty, so the glob can
# never be evaluated against /masses/spamassassin at the filesystem root.
rm -f "${SA_VER:?}"/masses/spamassassin/bayes*

# do the run
echo "[Running mass-check '$OPTS' in $CORPUS]"
# $OPTS is left unquoted on purpose: it holds several space-separated
# options that have to reach run-masses as separate arguments.
"$CORPUS/run-masses" "$SA_VER" $OPTS > /dev/null

# Check, file and upload the logs found in the current directory.
# Exits 1 if either log is unusable or cannot be put in place.

# Refuse to publish anything if either log is missing or empty.
if [ ! -s ham.log ] || [ ! -s spam.log ]; then
  echo "There seems to be a problem with either ham.log or spam.log, aborting!" >&2
  exit 1
fi

# A failed move would leave whatever file already sits under that name in
# results/, and the rsync below would then upload stale data -- so stop.
if ! mv -f ham.log "results/ham-$FILENAME.log" ||
   ! mv -f spam.log "results/spam-$FILENAME.log"; then
  echo "Couldn't move ham.log/spam.log into results/, aborting!" >&2
  exit 1
fi

# results.log is not part of the upload (it goes to results/hf/ and is only
# printed at the end), so a failed move is reported but doesn't stop the run.
mv -f results.log "results/hf/results-$FILENAME.log" ||
  echo "Warning: couldn't move results.log into results/hf/" >&2

if ! cd results; then
  echo "Couldn't cd to results, aborting!" >&2
  exit 1
fi
# now we have our ham.log and spam.log files...
echo "[Uploading daily corpus logs]"
# Only the variable is quoted; the leading * must stay a glob.
rsync -qCPcvuzb *-"$FILENAME".log "$RSYNC_USER@rsync.spamassassin.org::corpus/"

echo "[Our results]"
cat "hf/results-$FILENAME.log"