tests: Add some more UTF-8 validation corner cases

The move to c-utf8 for validation has exposed a few new branches where
our existing (fairly comprehensive) UTF-8 validation test suite didn’t
check things.

Add unit tests for those branches, so we keep code coverage.

I’ve validated (with an independent UTF-8 decoder) that the test vectors
are correctly marked as valid/invalid in the test data (so the tests
aren’t just blindly coded to match the behaviour of the new validator
code).

Signed-off-by: Philip Withnall <pwithnall@gnome.org>

Helps: #3481
This commit is contained in:
Philip Withnall 2024-10-03 15:43:35 +01:00
parent e570263483
commit 36e4bb9872
No known key found for this signature in database
GPG Key ID: DCDF5885B1F3ED73

View File

@@ -81,8 +81,9 @@ static Test global_test[] = {
{ "\xed\x9f\xbf", -1, 3, TRUE },
{ "\xee\x80\x80", -1, 3, TRUE },
{ "\xef\xbf\xbd", -1, 3, TRUE },
{ "\xf1\x80\x80\x80", -1, 4, TRUE },
{ "\xf4\x8f\xbf\xbf", -1, 4, TRUE },
{ "\xf4\x90\x80\x80", -1, 0, FALSE },
{ "\xf4\x90\x80\x80", -1, 0, FALSE }, /* bigger than U+10FFFF */
/* malformed sequences */
/* continuation bytes */
{ "\x80", -1, 0, FALSE },
@@ -94,6 +95,18 @@ static Test global_test[] = {
{ "\x80\xbf\x80\xbf\x80", -1, 0, FALSE },
{ "\x80\xbf\x80\xbf\x80\xbf", -1, 0, FALSE },
{ "\x80\xbf\x80\xbf\x80\xbf\x80", -1, 0, FALSE },
{ "\xe0\xa0\x20", -1, 0, FALSE },
{ "\xe1\x80\x20", -1, 0, FALSE },
{ "\xed\x80\x20", -1, 0, FALSE },
{ "\xf0\xc0\x80\x80", -1, 0, FALSE },
{ "\xf0\x90\x20\x80", -1, 0, FALSE },
{ "\xf0\x90\x80\x20", -1, 0, FALSE },
{ "\xf1\x20\x80\x80", -1, 0, FALSE },
{ "\xf1\x80\x20\x80", -1, 0, FALSE },
{ "\xf1\x80\x80\x20", -1, 0, FALSE },
{ "\xf4\x7f\x80\x80", -1, 0, FALSE },
{ "\xf4\x80\x20\x80", -1, 0, FALSE },
{ "\xf4\x80\x80\x20", -1, 0, FALSE },
/* all possible continuation byte */
{ "\x80", -1, 0, FALSE },
@@ -253,6 +266,9 @@ static Test global_test[] = {
{ "\x20\xf0\x80\x80\x80\x20", -1, 1, FALSE },
{ "\x20\xf8\x80\x80\x80\x80\x20", -1, 1, FALSE },
{ "\x20\xfc\x80\x80\x80\x80\x80\x20", -1, 1, FALSE },
{ "\xe0\x9f\x80", -1, 0, FALSE },
{ "\xe0\xc0\x80", -1, 0, FALSE },
{ "\xf0\x8f\x80\x80", -1, 0, FALSE },
/* illegal code positions */
{ "\x20\xed\xa0\x80\x20", -1, 1, FALSE },
{ "\x20\xed\xad\xbf\x20", -1, 1, FALSE },
@@ -270,6 +286,14 @@ static Test global_test[] = {
{ "\x20\xed\xaf\xbf\xed\xb0\x80\x20", -1, 1, FALSE },
{ "\x20\xed\xaf\xbf\xed\xbf\xbf\x20", -1, 1, FALSE },
/* ASCII boundaries */
{ "\x00", 1, 0, FALSE },
{ "\x01", -1, 1, TRUE },
{ "\x02", -1, 1, TRUE },
{ "\x7d", -1, 1, TRUE },
{ "\x7e", -1, 1, TRUE },
{ "\x7f", -1, 1, TRUE },
{ NULL, 0, 0, 0 }
};