mirror of
https://gitlab.gnome.org/GNOME/glib.git
synced 2025-01-15 16:56:14 +01:00
3b93b01219
2003-07-31 Noah Levitt <nlevitt@columbia.edu> * tests/utf8.txt: Change instances of U+10ffff to U+10fffd, since that is the last valid unicode character. Add check that U+10ffff is NOTUNICODE. (#118730)
302 lines
3.2 KiB
Plaintext
302 lines
3.2 KiB
Plaintext
# This file is derived from
|
||
#
|
||
# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
|
||
#
|
||
# Which was created by Markus Kuhn <mkuhn@acm.org> - 2000-09-02
|
||
#
|
||
# Lines beginning with # and blank lines are ignored
|
||
#
|
||
# Beyond that, this file consists of a series of test cases. Each test case consists of
|
||
# 2 or 3 lines:
|
||
#
|
||
# 1. A UTF-8 string
|
||
# 2. A status
|
||
# VALID : The string is a valid UTF-8 representation of valid Unicode
|
||
# INCOMPLETE : The string has a partial character at the end
|
||
# NOTUNICODE : The string is valid UTF-8, but the characters represented
|
||
# are not valid Unicode (e.g. UTF-16 surrogates, U+FFFE/U+FFFF, or values above U+10FFFD)
|
||
# OVERLONG : The string includes overlong sequences
|
||
# MALFORMED : The string is not valid UTF-8
|
||
# 3. If the status is VALID or NOTUNICODE, the UCS-4 representation of the string,
|
||
# as a series of hex numbers.
|
||
|
||
# 1 Some correct UTF-8 text
|
||
κόσμε
|
||
VALID
|
||
03ba 1f79 03c3 03bc 03b5
|
||
|
||
# 2.1 First possible sequence of a certain length
|
||
#
|
||
# FIXME - handle NULLS?
|
||
#
|
||
# [ NULL BYTE ]
|
||
#VALID
|
||
#0000
|
||
|
||
€
|
||
VALID
|
||
0080
|
||
|
||
à €
|
||
VALID
|
||
0800
|
||
|
||
ð<EFBFBD>€€
|
||
VALID
|
||
00010000
|
||
|
||
øˆ€€€
|
||
NOTUNICODE
|
||
00200000
|
||
|
||
ü„€€€€
|
||
NOTUNICODE
|
||
04000000
|
||
|
||
|
||
VALID
|
||
0000007f
|
||
|
||
ß¿
|
||
VALID
|
||
000007ff
|
||
|
||
ï¿¿
|
||
NOTUNICODE
|
||
0000ffff
|
||
|
||
÷¿¿¿
|
||
NOTUNICODE
|
||
001fffff
|
||
|
||
û¿¿¿¿
|
||
NOTUNICODE
|
||
03ffffff
|
||
|
||
ý¿¿¿¿¿
|
||
NOTUNICODE
|
||
7fffffff
|
||
|
||
# 2.3 Other boundary conditions
|
||
|
||
퟿
|
||
VALID
|
||
d7ff
|
||
|
||

|
||
VALID
|
||
e000
|
||
|
||
�
|
||
VALID
|
||
fffd
|
||
|
||
ô<EFBFBD>¿½
|
||
VALID
|
||
0010fffd
|
||
|
||
ô<EFBFBD>¿¿
|
||
NOTUNICODE
|
||
0010ffff
|
||
|
||
ô<EFBFBD>€€
|
||
NOTUNICODE
|
||
00110000
|
||
|
||
# 3.1 Unexpected continuation bytes
|
||
|
||
€
|
||
MALFORMED
|
||
¿
|
||
MALFORMED
|
||
€¿
|
||
MALFORMED
|
||
€¿€
|
||
MALFORMED
|
||
€¿€¿
|
||
MALFORMED
|
||
€¿€¿€
|
||
MALFORMED
|
||
€¿€¿€¿
|
||
MALFORMED
|
||
€¿€¿€¿€
|
||
MALFORMED
|
||
€<EFBFBD>‚ƒ„…†‡ˆ‰Š‹Œ<EFBFBD>Ž<EFBFBD><EFBFBD>‘’“”•–—˜™š›œ<EFBFBD>žŸ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿
|
||
MALFORMED
|
||
|
||
# 3.2 Lonely start characters
|
||
|
||
À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß
|
||
MALFORMED
|
||
à á â ã ä å æ ç è é ê ë ì í î ï
|
||
MALFORMED
|
||
ð ñ ò ó ô õ ö ÷
|
||
MALFORMED
|
||
ø ù ú û
|
||
MALFORMED
|
||
ü ý
|
||
MALFORMED
|
||
|
||
# 3.3 Sequences with last continuation byte missing
|
||
|
||
À
|
||
INCOMPLETE
|
||
à€
|
||
INCOMPLETE
|
||
ð€€
|
||
INCOMPLETE
|
||
ø€€€
|
||
INCOMPLETE
|
||
ü€€€€
|
||
INCOMPLETE
|
||
ß
|
||
INCOMPLETE
|
||
ï¿
|
||
INCOMPLETE
|
||
÷¿¿
|
||
INCOMPLETE
|
||
û¿¿¿
|
||
INCOMPLETE
|
||
ý¿¿¿¿
|
||
INCOMPLETE
|
||
|
||
# 3.4 Concatenation of incomplete sequences
|
||
|
||
Àà€ð€€ø€€€ü€€€€ßï¿÷¿¿û¿¿¿ý¿¿¿¿
|
||
MALFORMED
|
||
|
||
# 3.5 Impossible bytes
|
||
|
||
þ
|
||
MALFORMED
|
||
ÿ
|
||
MALFORMED
|
||
þþÿÿ
|
||
MALFORMED
|
||
|
||
# Examples of an overlong ASCII character
|
||
|
||
À¯
|
||
OVERLONG
|
||
à€¯
|
||
OVERLONG
|
||
ð€€¯
|
||
OVERLONG
|
||
ø€€€¯
|
||
OVERLONG
|
||
ü€€€€¯
|
||
OVERLONG
|
||
|
||
# Maximum overlong sequences
|
||
|
||
Á¿
|
||
OVERLONG
|
||
àŸ¿
|
||
OVERLONG
|
||
ð<EFBFBD>¿¿
|
||
OVERLONG
|
||
ø‡¿¿¿
|
||
OVERLONG
|
||
üƒ¿¿¿¿
|
||
OVERLONG
|
||
|
||
# Overlong representation of the NUL character
|
||
|
||
À€
|
||
OVERLONG
|
||
à€€
|
||
OVERLONG
|
||
ð€€€
|
||
OVERLONG
|
||
ø€€€€
|
||
OVERLONG
|
||
ü€€€€€
|
||
OVERLONG
|
||
|
||
# Illegal code positions
|
||
|
||
# Single UTF-16 surrogates
|
||
|
||
í €
|
||
NOTUNICODE
|
||
d800
|
||
|
||
í¿
|
||
NOTUNICODE
|
||
db7f
|
||
|
||
í®€
|
||
NOTUNICODE
|
||
db80
|
||
|
||
í¯¿
|
||
NOTUNICODE
|
||
dbff
|
||
|
||
í°€
|
||
NOTUNICODE
|
||
dc00
|
||
|
||
í¾€
|
||
NOTUNICODE
|
||
df80
|
||
|
||
í¿¿
|
||
NOTUNICODE
|
||
dfff
|
||
|
||
# Paired UTF-16 surrogates
|
||
|
||
í €í°€
|
||
NOTUNICODE
|
||
d800 dc00
|
||
|
||
í €í¿¿
|
||
NOTUNICODE
|
||
d800 dfff
|
||
|
||
í¿í°€
|
||
NOTUNICODE
|
||
db7f dc00
|
||
|
||
í¿í¿¿
|
||
NOTUNICODE
|
||
db7f dfff
|
||
|
||
󰀀
|
||
NOTUNICODE
|
||
db80 dc00
|
||
|
||
󰏿
|
||
NOTUNICODE
|
||
db80 dfff
|
||
|
||
􏰀
|
||
NOTUNICODE
|
||
dbff dc00
|
||
|
||
􏿿
|
||
NOTUNICODE
|
||
dbff dfff
|
||
|
||
# Other illegal code positions
|
||
|
||
￾
|
||
NOTUNICODE
|
||
fffe
|
||
|
||
ï¿¿
|
||
NOTUNICODE
|
||
ffff
|
||
|
||
################
|
||
#
|
||
# Some more tests, not from Markus Kuhn's file
|
||
#
|
||
|
||
# Mixed plane 0 and higher planes
|
||
|
||
Að<EFBFBD>€€Bô<EFBFBD>¿½C
|
||
VALID
|
||
41 00010000 42 10fffd 43
|