# sent.txt   [plain text]


#
#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
#       All Rights Reserved.
#
#   file:  sent.txt   
#
#   ICU Sentence Break Rules
#      See Unicode Standard Annex #29.
#      These rules are based on TR 29 version 4.0.0
#
    

#
# Character categories as defined in TR 29
#
# $Sep: the five separator code points listed explicitly below
#       (U+000A, U+000D, U+0085, U+2028, U+2029).
$Sep     = [\u000a \u000d \u0085 \u2028 \u2029];
# $Format: characters selected by the [:Format:] property set.
$Format  = [[:Format:]];
# $Sp: whitespace characters, with the $Sep characters removed.
$Sp      = [[:Whitespace:] - $Sep];
# $Lower: characters with the Lowercase property.
$Lower   = [[:Lowercase:]];
# $Upper: titlecase letters together with characters having the Uppercase property.
$Upper   = [[:TitleCase_Letter:] [:Uppercase:]];
# $OLetter ("other letter"): Alphabetic characters plus HEBREW PUNCTUATION GERESH,
#          minus everything already in $Lower or $Upper.
#          NOTE(review): relies on set operators being applied left to right,
#          i.e. (Alphabetic + GERESH) - (Lower + Upper) -- confirm against UnicodeSet docs.
$OLetter = [[:Alphabetic:] [:name = HEBREW PUNCTUATION GERESH:] - [$Lower $Upper]];
# $Numeric: characters whose LineBreak property value is Numeric.
$Numeric = [:LineBreak = Numeric:];

# $ATerm: the ambiguous terminator -- only the ASCII full stop ".".
#         Rules 6, 7 and 8 below decide whether an $ATerm really ends a sentence.
$ATerm = [.];  

# $Term: explicit list of unambiguous sentence-terminator code points
#        (starts with U+0021 "!" and U+003F "?"; the rest are given by code point).
$Term  = [\u0021 \u003F \u0589 \u061F \u06D4 \u0700 \u0701 \u0702 \u0964 \u1362
          \u1367 \u1368 \u104a \u104b \u166e \u1803 \u1809 \u203C \u203D \u2047 
          \u2048 \u2049 \u3002 \uFE52 \uFE57 \uFF01 \uFF0E \uFF1F \uFF61];
          
# $Close: opening punctuation, closing punctuation and LineBreak=Quotation characters,
#         excluding HEBREW PUNCTUATION GERESH and anything in $ATerm or $Term.
#         NOTE(review): as with $OLetter, this assumes the "-" applies to the union of
#         the three preceding sets (left-to-right evaluation) -- confirm.
$Close   = [[:Open_Punctuation:] [:Close_Punctuation:] [:Linebreak = Quotation:] -
           [[:name = HEBREW PUNCTUATION GERESH:] $ATerm $Term]];
           
           

# Define extended forms of the character classes,
#   incorporate grapheme cluster + format chars.
#
#   Each ...Ex variable is the base class followed by zero or more $Extend
#   characters and then zero or more $Format characters, so trailing
#   combining/format characters stay attached to the base character.
#   Only $ATerm, $Numeric, $Upper and $Term get an extended form here;
#   $Lower, $Close, $Sp and $Sep are used in their plain form by the rules below.

# $Extend: characters with Grapheme_Extend = TRUE.
$Extend     = [[:Grapheme_Extend = TRUE:]]; 
$ATermEx    = $ATerm   $Extend* $Format*;
$NumericEx  = $Numeric $Extend* $Format*;
$UpperEx    = $Upper   $Extend* $Format*;
$TermEx     = $Term    $Extend* $Format*;

#
#  $SepSeq keeps together CRLF as a separator.  (CRLF is a grapheme cluster)
#  It matches either a single $Sep character or the two-character
#  sequence U+000D U+000A.
#
$SepSeq  = $Sep | \u000d\u000a;

# $InteriorChars are those that never trigger a following break:
#   every character that is not in $Term, $ATerm or $Sep.
$InteriorChars = [^$Term $ATerm $Sep];   #Note:  includes Extend and Format chars


# Rule 6.  Match an ATerm (.) that does not cause a break because a number immediately follows it.
#          Shape: any run of interior characters, an extended ATerm, then an extended Numeric.
$NumberFollows = $InteriorChars* $ATermEx $NumericEx;


# Rule 7.  $UppersSurround   Match a no-break sentence fragment containing a . surrounded by Uppers
#          Shape: any run of interior characters, then UpperEx ATermEx UpperEx with nothing between.
$UppersSurround = $InteriorChars* $UpperEx $ATermEx $UpperEx;

# Rule 8.  Matches a sentence fragment containing "." that should not cause a sentence break,
#          because a lower case word follows the period.
#          Shape: interior run, extended ATerm, any $Close characters, any $Sp characters,
#          then any run of characters that are not letters ($OLetter/$Upper/$Lower) and
#          not $Sep, ending in a single $Lower (the plain class, not an extended form).
$LowerWordFollows  = $InteriorChars* $ATermEx $Close* $Sp* [^$OLetter $Upper $Lower $Sep]* $Lower;

# Rules 3, 9, 10, 11
#                       Matches a simple sentence, or the trailing part of a complex sentence,
#                       where a simple sentence contains no interior "."s.
#                       First alternative:  interior run, a terminator ($TermEx or $ATermEx),
#                                           trailing $Close and $Sp characters, optional $SepSeq.
#                       Second alternative: interior run with no terminator, optionally ended
#                                           by a $SepSeq.
#                       NOTE(review): every element of the second alternative is optional, so
#                       the expression can match an empty string -- confirm the rule engine
#                       handles that as intended.
$EndSequence       = $InteriorChars* ($TermEx | $ATermEx) $Close* $Sp* $SepSeq? |
                     $InteriorChars* $SepSeq?;



# Put them all together.  
#   The forward rule: zero or more no-break fragments (Rules 6, 7, 8, in any mix)
#   followed by exactly one $EndSequence, which supplies the end of the sentence.
($NumberFollows | $UppersSurround |  $LowerWordFollows)*  $EndSequence;

     
#
#  Reverse Rules
#
#  These expressions describe the same fragments as the forward rules, but with the
#  elements written in the opposite order (e.g. $Format* $Extend* here versus
#  $Extend* $Format* in the forward ...Ex definitions).
#  NOTE(review): presumably they are matched while moving backwards through the
#  text, per the "!" reverse-rule prefix of the ICU break rule syntax -- confirm.
#

# $EndGorp: any single character that can appear in the tail of a sentence
#           (terminators, separators, closing punctuation, extend/format chars, spaces).
$EndGorp                  = ($Term | $ATerm | $Sep | $Close | $Extend | $Format | $Sp);
# $RevEndSequence: either tail characters, an interior run, and more tail characters,
#                  or a $Sep followed by any run of characters that are not $ATerm or $Term.
$RevEndSequence           = $EndGorp* $InteriorChars* $EndGorp* | $Sep [^$ATerm $Term]*;
# Counterpart of $LowerWordFollows (Rule 8).
#   NOTE(review): $Close and $Sp are not listed explicitly here; they appear to be covered
#   by the negated set, and plain $ATerm is used without $Format/$Extend -- verify.
$ReverseLowerWordFollows  = $Lower [^$OLetter $Upper $Lower $Sep]* $ATerm $InteriorChars*;
# Counterpart of $UppersSurround (Rule 7); note the name is "Upper", not "Uppers".
$ReverseUpperSurround     = $Upper $Format* $Extend* $ATerm $Format* $Extend* $Upper $InteriorChars*;
# Counterpart of $NumberFollows (Rule 6).
#   NOTE(review): unlike the line above, no $Format* $Extend* is allowed before $ATerm here.
$ReverseNumberFollows     = $Numeric $Format* $Extend* $ATerm $InteriorChars*;

# The reverse rule itself (leading "!"): one $RevEndSequence, then zero or more reversed
# no-break fragments, then optionally one more character of any kind.
! $RevEndSequence ($ReverseLowerWordFollows | $ReverseUpperSurround | $ReverseNumberFollows)* .?;
# Disabled alternative reverse rule that would match any run of characters:
#! .*;