# utf8.txt - UTF-8 test cases (plain text)


# This file is derived from
#
#    http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
#
# which was created by Markus Kuhn <mkuhn@acm.org> - 2000-09-02.
#
# Lines beginning with # and blank lines are ignored.
#
# Beyond that, this file consists of a series of test cases. Each test case consists of
# 2 or 3 lines:
#
#  1. A UTF-8 string
#  2. A status
#      VALID      : The string is a valid UTF-8 representation of valid Unicode
#      INCOMPLETE : The string has a partial character at the end
#      NOTUNICODE : The string is valid UTF-8, but the characters represented
#                   are not valid Unicode (e.g. surrogates, noncharacters,
#                   or values above 10FFFF)
#      OVERLONG   : The string includes overlong sequences
#      MALFORMED  : The string is not valid UTF-8
#  3. If the status is VALID or NOTUNICODE, the UCS-4 representation of the
#     string, as a series of hex numbers.

# 1  Some correct UTF-8 text
κόσμε
VALID
03ba 1f79 03c3 03bc 03b5

# 2.1  First possible sequence of a certain length
#
# FIXME - handle NULLS?
#
# [ NULL BYTE ]
#VALID
#0000

€
VALID
0080

ࠀ
VALID
0800

𐀀
VALID
00010000


NOTUNICODE
00200000


NOTUNICODE
04000000

# 2.2  Last possible sequence of a certain length


VALID
0000007f

߿
VALID
000007ff

￿
NOTUNICODE
0000ffff


NOTUNICODE
001fffff


NOTUNICODE
03ffffff


NOTUNICODE
7fffffff

# 2.3  Other boundary conditions

퟿
VALID
d7ff


VALID
e000

�
VALID
fffd

􏿽
VALID
0010fffd

􏿿
NOTUNICODE
0010ffff


NOTUNICODE
00110000

# 3.1  Unexpected continuation bytes


MALFORMED

MALFORMED

MALFORMED

MALFORMED

MALFORMED

MALFORMED

MALFORMED

MALFORMED

MALFORMED

# 3.2  Lonely start characters

                                
MALFORMED
                
MALFORMED
        
MALFORMED
    
MALFORMED
  
MALFORMED

# 3.3  Sequences with last continuation byte missing


INCOMPLETE

INCOMPLETE

INCOMPLETE

INCOMPLETE

INCOMPLETE

INCOMPLETE

INCOMPLETE

INCOMPLETE

INCOMPLETE

INCOMPLETE

# 3.4  Concatenation of incomplete sequences


MALFORMED

# 3.5  Impossible bytes


MALFORMED

MALFORMED

MALFORMED

#  Examples of an overlong ASCII character


OVERLONG

OVERLONG

OVERLONG

OVERLONG

OVERLONG

#  Maximum overlong sequences


OVERLONG

OVERLONG

OVERLONG

OVERLONG

OVERLONG

# Overlong representation of the NUL character


OVERLONG

OVERLONG

OVERLONG

OVERLONG

OVERLONG

# Illegal code positions

# Single UTF-16 surrogates


NOTUNICODE
d800


NOTUNICODE
db7f


NOTUNICODE
db80


NOTUNICODE
dbff


NOTUNICODE
dc00


NOTUNICODE
df80


NOTUNICODE
dfff

# Paired UTF-16 surrogates


NOTUNICODE
d800 dc00


NOTUNICODE
d800 dfff


NOTUNICODE
db7f dc00


NOTUNICODE
db7f dfff


NOTUNICODE
db80 dc00


NOTUNICODE
db80 dfff


NOTUNICODE
dbff dc00


NOTUNICODE
dbff dfff

# Other illegal code positions

￾
NOTUNICODE
fffe

￿
NOTUNICODE
ffff

################
#
# Some more tests, not from Markus Kuhn's file
#

# Mixed plane 0 and higher planes

A𐀀B􏿽C
VALID
41 00010000 42 10fffd 43