runGA   [plain text]


#!/bin/sh

# Load the run settings (sets SCORESET; presumably also HAM_PREFERENCE,
# THRESHOLD, EPOCHS and optionally NOTE, which are used further down).
#
# The explicit "./" matters: for a name without a slash, POSIX "." searches
# only $PATH, so a plain ". config" does not find the file in the current
# directory under dash or under bash running as sh.
if [ ! -r ./config ]; then
	echo "Couldn't read ./config" >&2
	exit 1
fi
. ./config

# Name of this run's log directory: the scoreset followed by the tuning
# parameters.  A non-empty NOTE is appended as a trailing "-NOTE" tag;
# an empty or unset NOTE adds nothing.
NAME="set${SCORESET}"
LOGDIR="gen-${NAME}-${HAM_PREFERENCE}-${THRESHOLD}-${EPOCHS}${NOTE:+-${NOTE}}"

# Both the ham and the spam log for this scoreset must exist under ORIG/
# before anything else is attempted.  Two separate tests joined with ||
# are used because the -o operator of "[" is obsolescent and its result
# is unspecified once more than four operands are involved.
if [ ! -f "ORIG/ham-$NAME.log" ] || [ ! -f "ORIG/spam-$NAME.log" ]; then
	echo "Couldn't find logs for $NAME" >&2
	exit 1
fi

# Two modes, selected by the first argument:
#   (no argument)  full score-generation run: split the ORIG logs 90/10,
#                  build and run the perceptron, rewrite 50_scores.cf and
#                  write the test statistics into $LOGDIR.
#   (any argument) only regenerate the test results and statistics files
#                  in $LOGDIR from the existing *-test.log files.
if [ -z "$1" ]; then
	# This should be in here instead.  Prevents testing.
	svn revert ../rules/50_scores.cf

	echo "[Doing a scoreset $SCORESET score-generation run]"

	# Clean out old runs
	echo "[Cleaning up]"
	rm -rf spam-test.log ham-test.log spam.log ham.log \
		NSBASE SPBASE tmp make.output freqs perceptron.scores \
		"$LOGDIR"
	make clean >/dev/null

	# Create a directory to organize the logs with this group of settings
	mkdir "$LOGDIR" || exit 1

	# Generate 90/10 split logs: buckets 1-9 are concatenated into the
	# training log, bucket 10 becomes the test log.
	echo "[Generating 90/10 split ham]"
	mkdir "$LOGDIR/NSBASE" "$LOGDIR/SPBASE" || exit 1
	# Every cd is checked: the rm/cat/mv that follow use relative globs and
	# must never run in the wrong directory.
	cd "$LOGDIR/NSBASE" || exit 1
	../../tenpass/split-log-into-buckets-random 10 < "../../ORIG/ham-$NAME.log" > /dev/null
	cat split-[1-9].log > ham.log
	rm -f split-[1-9].log
	mv split-10.log ham-test.log

	echo "[Generating 90/10 split spam]"
	cd ../SPBASE || exit 1
	../../tenpass/split-log-into-buckets-random 10 < "../../ORIG/spam-$NAME.log" > /dev/null
	cat split-[1-9].log > spam.log
	rm -f split-[1-9].log
	mv split-10.log spam-test.log
	cd ../.. || exit 1

	echo "[Setting up for gen run]"
	# Ok, setup for a run
	ln -s "$LOGDIR/SPBASE/spam.log" .
	ln -s "$LOGDIR/NSBASE/ham.log" .
	ln -s "$LOGDIR/SPBASE/spam-test.log" .
	ln -s "$LOGDIR/NSBASE/ham-test.log" .

	# try to find number of processors
	# Each probe is tried on its own: "grep -c" prints 0 *and* exits
	# non-zero when nothing matches, so chaining the probes inside a single
	# command substitution could yield "0" followed by "1".
	numcpus=$(cpucount 2>/dev/null) ||
		numcpus=$(grep -E -c '^processor\b' /proc/cpuinfo 2>/dev/null) ||
		numcpus=1
	# Fall back to a single job unless we ended up with a positive integer.
	case "$numcpus" in
		'' | *[!0-9]* | 0) numcpus=1 ;;
	esac

	echo "[Generating perceptron]"
	# Generate perceptron with full logs
	if ! make -j "$numcpus" SCORESET="$SCORESET" > "$LOGDIR/make.output" 2>&1; then
		echo "make failed; see $LOGDIR/make.output" >&2
		exit 1
	fi

	# Everything printed during the run is shown and also kept in $LOGDIR/log.
	(
		echo "[config]"
		cat config
		echo "[gen run start]"
		pwd
		date
		./perceptron -p "$HAM_PREFERENCE" -t "$THRESHOLD" -e "$EPOCHS"
		mv perceptron.scores "$LOGDIR/scores"
		echo "[gen run end]"
	) | tee "$LOGDIR/log"

	svn revert ../rules/50_scores.cf

	# rewrite-cf-with-new-scores is given 50_scores.cf as an argument and
	# its stdout replaces that same file, so the output is staged first.
	# The staging file lives in a private mktemp directory (no predictable
	# /tmp name) and is only moved into place if the rewrite succeeded.
	scratch=$(mktemp -d "${TMPDIR:-/tmp}/runGA.XXXXXX") || exit 1
	if ./rewrite-cf-with-new-scores "$SCORESET" ../rules/50_scores.cf "$LOGDIR/scores" > "$scratch/50_scores.cf" &&
		mv "$scratch/50_scores.cf" ../rules/50_scores.cf; then
		rm -rf "$scratch"
	else
		rm -rf "$scratch"
		echo "Couldn't rewrite ../rules/50_scores.cf with new scores" >&2
		exit 1
	fi

	./fp-fn-statistics --ham ham-test.log --spam spam-test.log \
		--scoreset "$SCORESET" \
		--fnlog "$LOGDIR/false_negatives" \
		--fplog "$LOGDIR/false_positives" > "$LOGDIR/test"

else

	# This needs to have 50_scores.cf in place first ...
	echo "[gen test results]"
	./logs-to-c --spam=spam-test.log \
		--ham=ham-test.log \
		--count --cffile=../rules --scoreset="$SCORESET" | tee "$LOGDIR/test"

	echo "[STATISTICS file generation]"
	./mk-baseline-results "$SCORESET" | tee "$LOGDIR/statistics"
fi

exit 0