2001-01-05 22:22:47 +01:00
|
|
|
|
# This file is derived from
|
|
|
|
|
#
|
|
|
|
|
# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
|
|
|
|
|
#
|
|
|
|
|
# Which was created by Markus Kuhn <mkuhn@acm.org> - 2000-09-02
|
|
|
|
|
#
|
|
|
|
|
# lines beginning with # and blank lines are ignored
|
|
|
|
|
#
|
|
|
|
|
# Beyond that, this file consists of a series of test cases. Each test case consists of
|
|
|
|
|
# 2 or 3 lines:
|
|
|
|
|
#
|
|
|
|
|
# 1. A UTF-8 string
|
|
|
|
|
# 2. A status
|
|
|
|
|
# VALID : The string is a valid UTF-8 representation of valid Unicode
|
|
|
|
|
# INCOMPLETE : The string has a partial character at the end
|
|
|
|
|
# NOTUNICODE : The string is valid UTF-8, but the characters represented
|
|
|
|
|
# are not valid Unicode (e.g. UTF-16 surrogates or code points above 10FFFF)
|
|
|
|
|
# OVERLONG : The string includes overlong sequences
|
|
|
|
|
# MALFORMED : The string is not valid UTF-8
|
|
|
|
|
# 3. If the status is VALID or NOTUNICODE, the UCS-4 representation of the string,
|
|
|
|
|
# as a series of hex numbers.
|
|
|
|
|
|
|
|
|
|
# 1 Some correct UTF-8 text
|
|
|
|
|
κόσμε
|
|
|
|
|
VALID
|
|
|
|
|
03ba 1f79 03c3 03bc 03b5
|
|
|
|
|
|
|
|
|
|
# 2.1 First possible sequence of a certain length
|
|
|
|
|
#
|
|
|
|
|
# FIXME - handle NULLS?
|
|
|
|
|
#
|
|
|
|
|
# [ NULL BYTE ]
|
|
|
|
|
#VALID
|
|
|
|
|
#0000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
VALID
|
|
|
|
|
0080
|
|
|
|
|
|
|
|
|
|
ࠀ
|
|
|
|
|
VALID
|
|
|
|
|
0800
|
|
|
|
|
|
|
|
|
|
𐀀
|
|
|
|
|
VALID
|
|
|
|
|
00010000
|
|
|
|
|
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
NOTUNICODE
|
|
|
|
|
00200000
|
|
|
|
|
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
NOTUNICODE
|
|
|
|
|
04000000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
VALID
|
|
|
|
|
0000007f
|
|
|
|
|
|
|
|
|
|
߿
|
|
|
|
|
VALID
|
|
|
|
|
000007ff
|
|
|
|
|
|
|
|
|
|
|
2013-03-19 04:28:27 +01:00
|
|
|
|
VALID
|
2001-01-05 22:22:47 +01:00
|
|
|
|
0000ffff
|
|
|
|
|
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
NOTUNICODE
|
|
|
|
|
001fffff
|
|
|
|
|
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
NOTUNICODE
|
|
|
|
|
03ffffff
|
|
|
|
|
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
NOTUNICODE
|
|
|
|
|
7fffffff
|
|
|
|
|
|
|
|
|
|
# 2.3 Other boundary conditions
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
VALID
|
|
|
|
|
d7ff
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
VALID
|
|
|
|
|
e000
|
|
|
|
|
|
|
|
|
|
<EFBFBD>
|
|
|
|
|
VALID
|
|
|
|
|
fffd
|
|
|
|
|
|
2003-07-31 18:48:54 +02:00
|
|
|
|
|
2001-01-05 22:22:47 +01:00
|
|
|
|
VALID
|
2003-07-31 18:48:54 +02:00
|
|
|
|
0010fffd
|
|
|
|
|
|
|
|
|
|
|
2013-03-19 04:28:27 +01:00
|
|
|
|
VALID
|
2001-01-05 22:22:47 +01:00
|
|
|
|
0010ffff
|
|
|
|
|
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
NOTUNICODE
|
|
|
|
|
00110000
|
|
|
|
|
|
|
|
|
|
# 3.1 Unexpected continuation bytes
|
|
|
|
|
|
|
|
|
|
<EFBFBD>
|
|
|
|
|
MALFORMED
|
|
|
|
|
<EFBFBD>
|
|
|
|
|
MALFORMED
|
|
|
|
|
<EFBFBD><EFBFBD>
|
|
|
|
|
MALFORMED
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
MALFORMED
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
MALFORMED
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
MALFORMED
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
MALFORMED
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
MALFORMED
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
MALFORMED
|
|
|
|
|
|
|
|
|
|
# 3.2 Lonely start characters
|
|
|
|
|
|
|
|
|
|
<EFBFBD> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20>
|
|
|
|
|
MALFORMED
|
|
|
|
|
<EFBFBD> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20> <20>
|
|
|
|
|
MALFORMED
|
|
|
|
|
<EFBFBD> <20> <20> <20> <20> <20> <20> <20>
|
|
|
|
|
MALFORMED
|
|
|
|
|
<EFBFBD> <20> <20> <20>
|
|
|
|
|
MALFORMED
|
|
|
|
|
<EFBFBD> <20>
|
|
|
|
|
MALFORMED
|
|
|
|
|
|
|
|
|
|
# 3.3 Sequences with last continuation byte missing
|
|
|
|
|
|
|
|
|
|
<EFBFBD>
|
|
|
|
|
INCOMPLETE
|
|
|
|
|
<EFBFBD><EFBFBD>
|
|
|
|
|
INCOMPLETE
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
INCOMPLETE
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
INCOMPLETE
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
INCOMPLETE
|
|
|
|
|
<EFBFBD>
|
|
|
|
|
INCOMPLETE
|
|
|
|
|
<EFBFBD><EFBFBD>
|
|
|
|
|
INCOMPLETE
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
INCOMPLETE
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
INCOMPLETE
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
INCOMPLETE
|
|
|
|
|
|
|
|
|
|
# 3.4 Concatenation of incomplete sequences
|
|
|
|
|
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
MALFORMED
|
|
|
|
|
|
|
|
|
|
# 3.5 Impossible bytes
|
|
|
|
|
|
|
|
|
|
<EFBFBD>
|
|
|
|
|
MALFORMED
|
|
|
|
|
<EFBFBD>
|
|
|
|
|
MALFORMED
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
MALFORMED
|
|
|
|
|
|
|
|
|
|
# Examples of an overlong ASCII character
|
|
|
|
|
|
|
|
|
|
<EFBFBD><EFBFBD>
|
|
|
|
|
OVERLONG
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
OVERLONG
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
OVERLONG
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
OVERLONG
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
OVERLONG
|
|
|
|
|
|
|
|
|
|
# Maximum overlong sequences
|
|
|
|
|
|
|
|
|
|
<EFBFBD><EFBFBD>
|
|
|
|
|
OVERLONG
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
OVERLONG
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
OVERLONG
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
OVERLONG
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
OVERLONG
|
|
|
|
|
|
|
|
|
|
# Overlong representation of the NUL character
|
|
|
|
|
|
|
|
|
|
<EFBFBD><EFBFBD>
|
|
|
|
|
OVERLONG
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
OVERLONG
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
OVERLONG
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
OVERLONG
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
OVERLONG
|
|
|
|
|
|
|
|
|
|
# Illegal code positions
|
|
|
|
|
|
|
|
|
|
# Single UTF-16 surrogates
|
|
|
|
|
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
NOTUNICODE
|
|
|
|
|
d800
|
|
|
|
|
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
NOTUNICODE
|
|
|
|
|
db7f
|
|
|
|
|
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
NOTUNICODE
|
|
|
|
|
db80
|
|
|
|
|
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
NOTUNICODE
|
|
|
|
|
dbff
|
|
|
|
|
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
NOTUNICODE
|
|
|
|
|
dc00
|
|
|
|
|
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
NOTUNICODE
|
|
|
|
|
df80
|
|
|
|
|
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
NOTUNICODE
|
|
|
|
|
dfff
|
|
|
|
|
|
|
|
|
|
# Paired UTF-16 surrogates
|
|
|
|
|
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
NOTUNICODE
|
|
|
|
|
d800 dc00
|
|
|
|
|
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
NOTUNICODE
|
|
|
|
|
d800 dfff
|
|
|
|
|
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
NOTUNICODE
|
|
|
|
|
db7f dc00
|
|
|
|
|
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
NOTUNICODE
|
|
|
|
|
db7f dfff
|
|
|
|
|
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
NOTUNICODE
|
|
|
|
|
db80 dc00
|
|
|
|
|
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
NOTUNICODE
|
|
|
|
|
db80 dfff
|
|
|
|
|
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
NOTUNICODE
|
|
|
|
|
dbff dc00
|
|
|
|
|
|
|
|
|
|
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
|
|
|
|
NOTUNICODE
|
|
|
|
|
dbff dfff
|
|
|
|
|
|
|
|
|
|
################
|
|
|
|
|
#
|
|
|
|
|
# Some more tests, not from Markus Kuhn's file
|
|
|
|
|
#
|
|
|
|
|
|
|
|
|
|
# Mixed plane 0 and higher planes
|
|
|
|
|
|
2003-07-31 18:48:54 +02:00
|
|
|
|
A𐀀BC
|
2001-01-05 22:22:47 +01:00
|
|
|
|
VALID
|
2003-07-31 18:48:54 +02:00
|
|
|
|
41 00010000 42 10fffd 43
|