# This file is derived from 
#
#    http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
#    
# Which was created by   Markus Kuhn <mkuhn@acm.org> - 2000-09-02 
#
# Lines beginning with # and blank lines are ignored.
#
# Beyond that, this file consists of a series of test cases. Each test case consists of
# 2 or 3 lines:
#
#  1. A UTF-8 string
#  2. A status
#      VALID      : The string is a valid UTF-8 representation of valid Unicode
#      INCOMPLETE : The string has a partial character at the end
#      NOTUNICODE : The string is valid UTF-8, but the characters represented
#                   are not valid Unicode (e.g. surrogates, U+FFFE/U+FFFF,
#                   or values above U+10FFFF)
#      OVERLONG   : The string includes overlong sequences
#      MALFORMED  : The string is not valid UTF-8
#  3. If the status is VALID or NOTUNICODE, the UCS-4 representation of the string,
#     as a series of space-separated hex numbers.

# 1  Some correct UTF-8 text
κόσμε
VALID
03ba 1f79 03c3 03bc 03b5

# 2.1  First possible sequence of a certain length
#
# FIXME - handle NULLS?
#
# [ NULL BYTE ]
#VALID
#0000

€
VALID
0080

ࠀ
VALID
0800

𐀀
VALID
00010000

�����
NOTUNICODE
00200000

������
NOTUNICODE
04000000


VALID
0000007f

߿
VALID
000007ff

￿
NOTUNICODE
0000ffff

����
NOTUNICODE
001fffff

�����
NOTUNICODE
03ffffff

������
NOTUNICODE
7fffffff

# 2.3  Other boundary conditions

퟿
VALID
d7ff


VALID
e000

�
VALID
fffd

􏿽
VALID
0010fffd

􏿿
NOTUNICODE
0010ffff

����
NOTUNICODE
00110000

# 3.1  Unexpected continuation bytes

�
MALFORMED
�
MALFORMED
��
MALFORMED
���
MALFORMED
����
MALFORMED
�����
MALFORMED
������
MALFORMED
�������
MALFORMED
���������������������������������������������������������������
MALFORMED

# 3.2  Lonely start characters

� � � � � � � � � � � � � � � � � � � � � � � � � � � � � � � � 
MALFORMED
� � � � � � � � � � � � � � � � 
MALFORMED
� � � � � � � � 
MALFORMED
� � � � 
MALFORMED
� � 
MALFORMED

# 3.3  Sequences with last continuation byte missing

�
INCOMPLETE
��
INCOMPLETE
���
INCOMPLETE
����
INCOMPLETE
�����
INCOMPLETE
�
INCOMPLETE
�
INCOMPLETE
���
INCOMPLETE
����
INCOMPLETE
�����
INCOMPLETE

# 3.4  Concatenation of incomplete sequences

�����������������������������
MALFORMED

# 3.5  Impossible bytes

�
MALFORMED
�
MALFORMED
����
MALFORMED

#  Examples of an overlong ASCII character

��
OVERLONG
���
OVERLONG
����
OVERLONG
�����
OVERLONG
������
OVERLONG

#  Maximum overlong sequences

��
OVERLONG
���
OVERLONG
����
OVERLONG
�����
OVERLONG
������
OVERLONG

# Overlong representation of the NUL character

��
OVERLONG
���
OVERLONG
����
OVERLONG
�����
OVERLONG
������
OVERLONG

# Illegal code positions

# Single UTF-16 surrogates

���
NOTUNICODE
d800

���
NOTUNICODE
db7f

���
NOTUNICODE
db80

���
NOTUNICODE
dbff

���
NOTUNICODE
dc00

���
NOTUNICODE
df80

���
NOTUNICODE
dfff

# Paired UTF-16 surrogates

������
NOTUNICODE
d800 dc00

������
NOTUNICODE
d800 dfff

������
NOTUNICODE
db7f dc00

������
NOTUNICODE
db7f dfff

������
NOTUNICODE
db80 dc00

������
NOTUNICODE
db80 dfff

������
NOTUNICODE
dbff dc00

������
NOTUNICODE
dbff dfff

# Other illegal code positions

￾
NOTUNICODE
fffe

￿
NOTUNICODE
ffff

################
#
# Some more tests, not from Markus Kuhn's file
#

# Mixed plane 0 and higher planes

A𐀀B􏿽C
VALID
41 00010000 42 10fffd 43