# 20_body_tests.cf


# SpamAssassin rules file: body tests
#
# Please don't modify this file as your changes will be overwritten with
# the next update. Use @@LOCAL_RULES_DIR@@/local.cf instead.
# See 'perldoc Mail::SpamAssassin::Conf' for details.
#
# Note: body tests are run with long lines, so be sure to limit the
# size of searches; use /.{0,30}/ instead of /.*/ to avoid huge
# search times.
#
# Note: If you are adding a rule which looks for a phrase in the body
# (as most of them do), please add it to rules/20_phrases.cf instead.
#
# <@LICENSE>
# Copyright 2004 Apache Software Foundation
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>
#
###########################################################################

require_version @@VERSION@@

###########################################################################
# GTUBE - the Generic Test for Unsolicited Bulk Email.  The pattern is one
# fixed literal string (every "*" and "." in it is escaped), so the rule
# only fires on a message containing exactly that string.
body     GTUBE  /XJS\*C4JDBQADN1\.NSBN3\*2IDNEN\*GTUBE-STANDARD-ANTI-UBE-TEST-EMAIL\*C\.34X/
describe GTUBE  Generic Test for Unsolicited Bulk Email
tflags   GTUBE  userconf noautolearn

###########################################################################
# Message digest tests

# Razor2 digest lookup: a "full" rule backed by the check_razor2() eval and
# flagged as a network test.
full     RAZOR2_CHECK  eval:check_razor2()
describe RAZOR2_CHECK  Listed in Razor2 (http://razor.sf.net/)
tflags   RAZOR2_CHECK  net

# cf (confidence level) is how likely the message is spam.  RAZOR2_CHECK
# returns true if cf>=min_cf (as defined by user/config).  These return
# true depending on what cf value the message has.  The algorithm goes:
# check the message via razor, then go through each mime part and check
# how razor scored it.  If the part is contested (i.e. it's been reported
# as both ham and spam) it's ignored.  SA takes the highest non-contested
# part cf score and returns it for the range rules, i.e. this is essentially
# Razor 2's logic_method 4.
#
# Note: Disabling RAZOR2_CHECK (score RAZOR2_CHECK 0) will also disable
# these checks.
#
# Note: The scores are set to 0 on these tests right now until they get
# better integrated with SA overall.
#
# Range rule: check_razor2_range() is called with the bounds '51' and '100',
# which the description words as a confidence level above 50%.
body     RAZOR2_CF_RANGE_51_100  eval:check_razor2_range('51','100')
describe RAZOR2_CF_RANGE_51_100  Razor2 gives confidence level above 50%
tflags   RAZOR2_CF_RANGE_51_100  net

# DCC digest lookup: a "full" rule backed by the check_dcc() eval and
# flagged as a network test.
full     DCC_CHECK  eval:check_dcc()
describe DCC_CHECK  Listed in DCC (http://rhyolite.com/anti-spam/dcc/)
tflags   DCC_CHECK  net

# Pyzor digest lookup: a "full" rule backed by the check_pyzor() eval and
# flagged as a network test.
full     PYZOR_CHECK  eval:check_pyzor()
describe PYZOR_CHECK  Listed in Pyzor (http://pyzor.sf.net/)
tflags   PYZOR_CHECK  net

# Bug 2220 (nice results): add up the three digest rules and fire when more
# than one of them hit.
meta     DIGEST_MULTIPLE  RAZOR2_CHECK + DCC_CHECK + PYZOR_CHECK > 1
describe DIGEST_MULTIPLE  Message hits more than one network digest check
tflags   DIGEST_MULTIPLE  net

# Tracking-ID tokens; this seemed to be the new fashion as of Jul 5 2002.
# The pattern is anchored at both ends and matches, case-insensitively, a
# single run of 24-84 letters/digits in which "-" and "_" are allowed only
# in the middle section, optionally followed by whitespace.
# Base64-encoded parts need to be stripped before this match.
body     TRACKER_ID  /^[a-z0-9]{6,24}[-_a-z0-9]{12,36}[a-z0-9]{6,24}\s*\z/is
describe TRACKER_ID  Incorporates a tracking ID number

# Two consecutive bytes from the set (octal) 042, 223, 224, 262, 263, 271
# - 042 being the ASCII double quote - then up to 16 non-whitespace
# characters, then two more bytes from the same set.
body     WEIRD_QUOTING  /[\042\223\224\262\263\271]{2}\S{0,16}[\042\223\224\262\263\271]{2}/
describe WEIRD_QUOTING  Weird repeated double-quotation marks

###########################################################################
# these tests don't actually use rawbody since rawbody isn't raw enough;
# they must be written very carefully to avoid modifying the original content

# MIME Content-Transfer-Encoding control rules.  All of them are declared
# as rawbody rules and share the check_for_mime() eval; only the string
# argument handed to it differs from rule to rule.
rawbody  __MIME_BASE64          eval:check_for_mime('mime_base64_count')
describe __MIME_BASE64          Includes a base64 attachment

rawbody  __MIME_QP              eval:check_for_mime('mime_qp_count')
describe __MIME_QP              Includes a quoted-printable attachment

rawbody  MIME_BASE64_BLANKS     eval:check_for_mime('mime_base64_blanks')
describe MIME_BASE64_BLANKS     Extra blank lines in base64 encoding

rawbody  MIME_BASE64_NO_NAME    eval:check_for_mime('mime_base64_no_name')
describe MIME_BASE64_NO_NAME    base64 attachment does not have a file name

rawbody  MIME_BASE64_TEXT       eval:check_for_mime('mime_base64_encoded_text')
describe MIME_BASE64_TEXT       Message text disguised using base64 encoding

rawbody  MIME_MISSING_BOUNDARY  eval:check_for_mime('mime_missing_boundary')
describe MIME_MISSING_BOUNDARY  MIME section missing boundary

# Calls check_mime_multipart_ratio() with the arguments '0.00' and '0.01'
# (presumably lower/upper ratio bounds - confirm against the eval).
body     MIME_HTML_MOSTLY       eval:check_mime_multipart_ratio('0.00','0.01')
describe MIME_HTML_MOSTLY       Multipart message mostly text/html MIME

# Good test, from Steve Linford via Charlie Watts.
body     MIME_HTML_ONLY         eval:check_for_mime_html_only()
describe MIME_HTML_ONLY         Message only has text/html MIME parts

# multipart/alternative has very good accuracy; the other multipart types
# behave much like MIME_HTML_ONLY, so they do not get a rule of their own.
# The helper header rule matches "multipart/alternative" case-insensitively
# anywhere in the Content-Type header.
header   __CTYPE_MULTIPART_ALT  Content-Type =~ /multipart\/alternative/i
meta     MIME_HTML_ONLY_MULTI   (__CTYPE_MULTIPART_ALT && MIME_HTML_ONLY)
describe MIME_HTML_ONLY_MULTI   Multipart message only has text/html MIME parts

# check_for_mime() again, this time with 'mime_qp_long_line'.
rawbody  MIME_QP_LONG_LINE  eval:check_for_mime('mime_qp_long_line')
describe MIME_QP_LONG_LINE  Quoted-printable line longer than 76 chars

# Typically an indicator of viruses rather than spam; it is only used here
# to clean corpora.
# TODO: find a better tflags category for these tests.
rawbody  MIME_SUSPECT_NAME  eval:check_for_mime('mime_suspect_name')
describe MIME_SUSPECT_NAME  MIME filename does not match content
tflags   MIME_SUSPECT_NAME  userconf

# MIME_CHARSET_FARAWAY needs both halves to hit: the check_for_mime()
# 'mime_faraway_charset' test and __HIGHBITS.  __HIGHBITS matches four or
# more repetitions of a byte in the range 0x80-0xff, each optionally
# followed by one other character.
# Note: __HIGHBITS is also used by HTML_CHARSET_FARAWAY.
body     __HIGHBITS              /(?:[\x80-\xff].?){4,}/
rawbody  __MIME_CHARSET_FARAWAY  eval:check_for_mime('mime_faraway_charset')
meta     MIME_CHARSET_FARAWAY    (__MIME_CHARSET_FARAWAY && __HIGHBITS)
describe MIME_CHARSET_FARAWAY    MIME character set indicates foreign language
tflags   MIME_CHARSET_FARAWAY    userconf

# Uses a simple algorithm to decide whether the text and HTML parts of a
# multipart/alternative message differ from each other.
body     MPART_ALT_DIFF  eval:multipart_alternative_difference('99', '100')
describe MPART_ALT_DIFF  HTML and text parts are different

###########################################################################

# Character-set and language checks.  All three are eval-backed body rules
# carrying the userconf tflag.
body     CHARSET_FARAWAY         eval:check_for_faraway_charset()
describe CHARSET_FARAWAY         Character set indicates a foreign language
tflags   CHARSET_FARAWAY         userconf

body     UNWANTED_LANGUAGE_BODY  eval:check_language()
describe UNWANTED_LANGUAGE_BODY  Message written in an undesired language
tflags   UNWANTED_LANGUAGE_BODY  userconf

body     BODY_8BITS              eval:check_for_body_8bits()
describe BODY_8BITS              Body includes 8 consecutive 8-bit characters
tflags   BODY_8BITS              userconf

# duncf
# Matches an address-like token that uses "^" and "(" as its separators:
# one or more lowercase letters, "(", "]" or "-"; a literal "^"; one or
# more lowercase letters or "-"; a literal "("; then 2-3 lowercase letters,
# all between word boundaries.  The "test" lines give two samples that must
# match and one ordinary address that must not.
body     EMAIL_ROT13  /\b[a-z(\]-]+\^[a-z-]+\([a-z]{2,3}\b/
describe EMAIL_ROT13  Body contains a ROT13-encoded email address
test     EMAIL_ROT13  ok    qhabs^ebtref(pbz
test     EMAIL_ROT13  ok    zxrggyre^riv-vap(pbz
test     EMAIL_ROT13  fail  duncf-nospam@rogers.com

# Blank-line ratio buckets.  Each rule hands check_blank_line_ratio() the
# two percentage bounds named in its description plus a third argument of
# '4' (the meaning of that third argument is not visible here - see the
# eval).
body     BLANK_LINES_70_80   eval:check_blank_line_ratio('70','80','4')
describe BLANK_LINES_70_80   Message body has 70-80% blank lines

body     BLANK_LINES_80_90   eval:check_blank_line_ratio('80','90','4')
describe BLANK_LINES_80_90   Message body has 80-90% blank lines

body     BLANK_LINES_90_100  eval:check_blank_line_ratio('90','100','4')
describe BLANK_LINES_90_100  Message body has 90-100% blank lines

# check_unique_words() is called with the arguments '0.946' and '3.1'.
body     UNIQUE_WORDS  eval:check_unique_words('0.946', '3.1')
describe UNIQUE_WORDS  Message body has many words used only once

# check_domain_ratio() is called with the single argument '0.022'.
body     DOMAIN_RATIO  eval:check_domain_ratio('0.022')
describe DOMAIN_RATIO  Message body mentions many internet domains

# LONGWORDS: a run of consecutive long lowercase words, each followed by
# whitespace.  The four sub-rules trade word length against run length:
#   __LONGWORDS_A:  6 words of 8+ letters
#   __LONGWORDS_B:  8 words of 7+ letters
#   __LONGWORDS_C:  9 words of 6+ letters
#   __LONGWORDS_D: 10 words of 5+ letters
# If these are too expensive as a whole, delete __LONGWORDS_B and
# __LONGWORDS_C and use (__LONGWORDS_D || __LONGWORDS_A) in the meta rule
# instead, which is very close in quality.
body     __LONGWORDS_A  /\b(?:[a-z]{8,}\s+){6}/
body     __LONGWORDS_B  /\b(?:[a-z]{7,}\s+){8}/
body     __LONGWORDS_C  /\b(?:[a-z]{6,}\s+){9}/
body     __LONGWORDS_D  /\b(?:[a-z]{5,}\s+){10}/
meta     LONGWORDS      (__LONGWORDS_A || __LONGWORDS_B || __LONGWORDS_C || __LONGWORDS_D)
describe LONGWORDS      Long string of long words