glib/tests/utf8.txt

298 lines
3.2 KiB
Plaintext
Raw Normal View History

# This file is derived from
#
# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
#
# Which was created by Markus Kuhn <mkuhn@acm.org> - 2000-09-02
#
# Lines beginning with # and blank lines are ignored.
#
# Beyond that, this file consists of a series of test cases. Each test case consists of
# 2 or 3 lines:
#
# 1. A UTF-8 string
# 2. A status
# VALID : The string is a valid UTF-8 representation of valid Unicode
# INCOMPLETE : The string has a partial character at the end
# NOTUNICODE : The string is valid UTF-8, but the characters represented
# are not valid Unicode (e.g. UTF-16 surrogates d800-dfff, fffe/ffff, or values above 10ffff)
# OVERLONG : The string includes overlong sequences
# MALFORMED : The string is not valid UTF-8
# 3. If the status is VALID or NOTUNICODE, the UCS-4 representation of the string,
# as a series of hex numbers.
# 1 Some correct UTF-8 text
κόσμε
VALID
03ba 1f79 03c3 03bc 03b5
# 2.1 First possible sequence of a certain length
#
# FIXME - handle NULLS?
#
# [ NULL BYTE ]
#VALID
#0000
€
VALID
0080
VALID
0800
𐀀
VALID
00010000
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
NOTUNICODE
00200000
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
NOTUNICODE
04000000

VALID
0000007f
߿
VALID
000007ff
￿
NOTUNICODE
0000ffff
<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
NOTUNICODE
001fffff
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
NOTUNICODE
03ffffff
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
NOTUNICODE
7fffffff
# 2.3 Other boundary conditions
VALID
d7ff
VALID
e000
<EFBFBD>
VALID
fffd
􏿿
VALID
0010ffff
<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
NOTUNICODE
00110000
# 3.1 Unexpected continuation bytes
<EFBFBD>
MALFORMED
<EFBFBD>
MALFORMED
<EFBFBD><EFBFBD>
MALFORMED
<EFBFBD><EFBFBD><EFBFBD>
MALFORMED
<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
MALFORMED
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
MALFORMED
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
MALFORMED
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
MALFORMED
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
MALFORMED
# 3.2 Lonely start characters
<EFBFBD> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20>
MALFORMED
<EFBFBD> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20>
MALFORMED
<EFBFBD> <20> <20> <20> <20> <20> <20> <20>
MALFORMED
<EFBFBD> <20> <20> <20>
MALFORMED
<EFBFBD> <20>
MALFORMED
# 3.3 Sequences with last continuation byte missing
<EFBFBD>
INCOMPLETE
<EFBFBD><EFBFBD>
INCOMPLETE
<EFBFBD><EFBFBD><EFBFBD>
INCOMPLETE
<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
INCOMPLETE
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
INCOMPLETE
<EFBFBD>
INCOMPLETE
<EFBFBD><EFBFBD>
INCOMPLETE
<EFBFBD><EFBFBD><EFBFBD>
INCOMPLETE
<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
INCOMPLETE
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
INCOMPLETE
# 3.4 Concatenation of incomplete sequences
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
MALFORMED
# 3.5 Impossible bytes
<EFBFBD>
MALFORMED
<EFBFBD>
MALFORMED
<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
MALFORMED
# Examples of an overlong ASCII character
<EFBFBD><EFBFBD>
OVERLONG
<EFBFBD><EFBFBD><EFBFBD>
OVERLONG
<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
OVERLONG
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
OVERLONG
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
OVERLONG
# Maximum overlong sequences
<EFBFBD><EFBFBD>
OVERLONG
<EFBFBD><EFBFBD><EFBFBD>
OVERLONG
<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
OVERLONG
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
OVERLONG
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
OVERLONG
# Overlong representation of the NUL character
<EFBFBD><EFBFBD>
OVERLONG
<EFBFBD><EFBFBD><EFBFBD>
OVERLONG
<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
OVERLONG
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
OVERLONG
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
OVERLONG
# Illegal code positions
# Single UTF-16 surrogates
<EFBFBD><EFBFBD><EFBFBD>
NOTUNICODE
d800
<EFBFBD><EFBFBD><EFBFBD>
NOTUNICODE
db7f
<EFBFBD><EFBFBD><EFBFBD>
NOTUNICODE
db80
<EFBFBD><EFBFBD><EFBFBD>
NOTUNICODE
dbff
<EFBFBD><EFBFBD><EFBFBD>
NOTUNICODE
dc00
<EFBFBD><EFBFBD><EFBFBD>
NOTUNICODE
df80
<EFBFBD><EFBFBD><EFBFBD>
NOTUNICODE
dfff
# Paired UTF-16 surrogates
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
NOTUNICODE
d800 dc00
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
NOTUNICODE
d800 dfff
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
NOTUNICODE
db7f dc00
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
NOTUNICODE
db7f dfff
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
NOTUNICODE
db80 dc00
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
NOTUNICODE
db80 dfff
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
NOTUNICODE
dbff dc00
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
NOTUNICODE
dbff dfff
# Other illegal code positions
NOTUNICODE
fffe
￿
NOTUNICODE
ffff
################
#
# Some more tests, not from Markus Kuhn's file
#
# Mixed plane 0 and higher planes
A𐀀B􏿿C
VALID
41 00010000 42 10ffff 43