mirror of
https://gitlab.gnome.org/GNOME/glib.git
synced 2025-02-12 21:50:36 +01:00
move $enable_debug down below checks for GCC to avoid setting CFLAGS
Fri Jan 5 11:25:42 2001 Owen Taylor <otaylor@redhat.com> * configure.in (PACKAGE): move $enable_debug down below checks for GCC to avoid setting CFLAGS prematurely, change checks to avoid adding -g twice. * gutf8.c (g_ucs4_to_utf8): Support len < 0 to mean 0 termination. * gutf8.c (g_utf8_to_ucs4): Terminate result with 0. * tests/mainloop-test.c (main): Fix uses of g_main_loop_destroy(). * tests/unicode-encoding.c tests/Makefile.am tests/utf8.txt: Tests for unicode-conversion code. * gconvert.c (g_convert, g_convert_with_fallback): work around a couple of GNU libc bugs. * gconvert.[ch] (g_{locale,filename}_{to,from}_utf8): Standardize arguments to match g_convert(). Document. * gunicode.[ch]: - Implement conversion functions to and from UTF-16 - Standardize unicode conversion functions on prototype like g_convert. - Add a lot of error checking to unicode conversion functions. * gunicode.[ch] (g_utf8_to_ucs4_fast): Add fast, non-checking variant of g_utf8_to_ucs4. * gutf8.c (g_utf8_validate): - add g_return_if_fail (str != NULL). - add checks for overlong strings, non-valid Unicode characters (>= 110000) and single surrogates.
This commit is contained in:
parent
29cff66fc1
commit
956f00ed96
37
ChangeLog
37
ChangeLog
@ -1,3 +1,40 @@
|
||||
Fri Jan 5 11:25:42 2001 Owen Taylor <otaylor@redhat.com>
|
||||
|
||||
* configure.in (PACKAGE): move $enable_debug down below
|
||||
checks for GCC to avoid setting CFLAGS prematurely,
|
||||
change checks to avoid adding -g twice.
|
||||
|
||||
* gutf8.c (g_ucs4_to_utf8): Support len < 0 to mean
|
||||
0 termination.
|
||||
|
||||
* gutf8.c (g_utf8_to_ucs4): Terminate result with 0.
|
||||
|
||||
* tests/mainloop-test.c (main): Fix uses of
|
||||
g_main_loop_destroy().
|
||||
|
||||
* tests/unicode-encoding.c tests/Makefile.am tests/utf8.txt:
|
||||
Tests for unicode-conversion code.
|
||||
|
||||
* gconvert.c (g_convert, g_convert_with_fallback): work around
|
||||
a couple of GNU libc bugs.
|
||||
|
||||
* gconvert.[ch] (g_{locale,filename}_{to,from}_utf8): Standardize
|
||||
arguments to match g_convert(). Document.
|
||||
|
||||
* gunicode.[ch]:
|
||||
- Implement conversion functions to and from UTF-16
|
||||
- Standardize unicode conversion functions on prototype like
|
||||
g_convert.
|
||||
- Add a lot of error checking to unicode conversion functions.
|
||||
|
||||
* gunicode.[ch] (g_utf8_to_ucs4_fast): Add fast, non-checking
|
||||
variant of g_utf8_to_ucs4.
|
||||
|
||||
* gutf8.c (g_utf8_validate):
|
||||
- add g_return_if_fail (str != NULL).
|
||||
- add checks for overlong strings, non-valid Unicode characters (>= 110000)
|
||||
and single surrogates.
|
||||
|
||||
2001-01-05 Tor Lillqvist <tml@iki.fi>
|
||||
|
||||
* testglib.c (main): Add test for g_path_skip_root().
|
||||
|
@ -1,3 +1,40 @@
|
||||
Fri Jan 5 11:25:42 2001 Owen Taylor <otaylor@redhat.com>
|
||||
|
||||
* configure.in (PACKAGE): move $enable_debug down below
|
||||
checks for GCC to avoid setting CFLAGS prematurely,
|
||||
change checks to avoid adding -g twice.
|
||||
|
||||
* gutf8.c (g_ucs4_to_utf8): Support len < 0 to mean
|
||||
0 termination.
|
||||
|
||||
* gutf8.c (g_utf8_to_ucs4): Terminate result with 0.
|
||||
|
||||
* tests/mainloop-test.c (main): Fix uses of
|
||||
g_main_loop_destroy().
|
||||
|
||||
* tests/unicode-encoding.c tests/Makefile.am tests/utf8.txt:
|
||||
Tests for unicode-conversion code.
|
||||
|
||||
* gconvert.c (g_convert, g_convert_with_fallback): work around
|
||||
a couple of GNU libc bugs.
|
||||
|
||||
* gconvert.[ch] (g_{locale,filename}_{to,from}_utf8): Standardize
|
||||
arguments to match g_convert(). Document.
|
||||
|
||||
* gunicode.[ch]:
|
||||
- Implement conversion functions to and from UTF-16
|
||||
- Standardize unicode conversion functions on prototype like
|
||||
g_convert.
|
||||
- Add a lot of error checking to unicode conversion functions.
|
||||
|
||||
* gunicode.[ch] (g_utf8_to_ucs4_fast): Add fast, non-checking
|
||||
variant of g_utf8_to_ucs4.
|
||||
|
||||
* gutf8.c (g_utf8_validate):
|
||||
- add g_return_if_fail (str != NULL).
|
||||
- add checks for overlong strings, non-valid Unicode characters (>= 110000)
|
||||
and single surrogates.
|
||||
|
||||
2001-01-05 Tor Lillqvist <tml@iki.fi>
|
||||
|
||||
* testglib.c (main): Add test for g_path_skip_root().
|
||||
|
@ -1,3 +1,40 @@
|
||||
Fri Jan 5 11:25:42 2001 Owen Taylor <otaylor@redhat.com>
|
||||
|
||||
* configure.in (PACKAGE): move $enable_debug down below
|
||||
checks for GCC to avoid setting CFLAGS prematurely,
|
||||
change checks to avoid adding -g twice.
|
||||
|
||||
* gutf8.c (g_ucs4_to_utf8): Support len < 0 to mean
|
||||
0 termination.
|
||||
|
||||
* gutf8.c (g_utf8_to_ucs4): Terminate result with 0.
|
||||
|
||||
* tests/mainloop-test.c (main): Fix uses of
|
||||
g_main_loop_destroy().
|
||||
|
||||
* tests/unicode-encoding.c tests/Makefile.am tests/utf8.txt:
|
||||
Tests for unicode-conversion code.
|
||||
|
||||
* gconvert.c (g_convert, g_convert_with_fallback): work around
|
||||
a couple of GNU libc bugs.
|
||||
|
||||
* gconvert.[ch] (g_{locale,filename}_{to,from}_utf8): Standardize
|
||||
arguments to match g_convert(). Document.
|
||||
|
||||
* gunicode.[ch]:
|
||||
- Implement conversion functions to and from UTF-16
|
||||
- Standardize unicode conversion functions on prototype like
|
||||
g_convert.
|
||||
- Add a lot of error checking to unicode conversion functions.
|
||||
|
||||
* gunicode.[ch] (g_utf8_to_ucs4_fast): Add fast, non-checking
|
||||
variant of g_utf8_to_ucs4.
|
||||
|
||||
* gutf8.c (g_utf8_validate):
|
||||
- add g_return_if_fail (str != NULL).
|
||||
- add checks for overlong strings, non-valid Unicode characters (>= 110000)
|
||||
and single surrogates.
|
||||
|
||||
2001-01-05 Tor Lillqvist <tml@iki.fi>
|
||||
|
||||
* testglib.c (main): Add test for g_path_skip_root().
|
||||
|
@ -1,3 +1,40 @@
|
||||
Fri Jan 5 11:25:42 2001 Owen Taylor <otaylor@redhat.com>
|
||||
|
||||
* configure.in (PACKAGE): move $enable_debug down below
|
||||
checks for GCC to avoid setting CFLAGS prematurely,
|
||||
change checks to avoid adding -g twice.
|
||||
|
||||
* gutf8.c (g_ucs4_to_utf8): Support len < 0 to mean
|
||||
0 termination.
|
||||
|
||||
* gutf8.c (g_utf8_to_ucs4): Terminate result with 0.
|
||||
|
||||
* tests/mainloop-test.c (main): Fix uses of
|
||||
g_main_loop_destroy().
|
||||
|
||||
* tests/unicode-encoding.c tests/Makefile.am tests/utf8.txt:
|
||||
Tests for unicode-conversion code.
|
||||
|
||||
* gconvert.c (g_convert, g_convert_with_fallback): work around
|
||||
a couple of GNU libc bugs.
|
||||
|
||||
* gconvert.[ch] (g_{locale,filename}_{to,from}_utf8): Standardize
|
||||
arguments to match g_convert(). Document.
|
||||
|
||||
* gunicode.[ch]:
|
||||
- Implement conversion functions to and from UTF-16
|
||||
- Standardize unicode conversion functions on prototype like
|
||||
g_convert.
|
||||
- Add a lot of error checking to unicode conversion functions.
|
||||
|
||||
* gunicode.[ch] (g_utf8_to_ucs4_fast): Add fast, non-checking
|
||||
variant of g_utf8_to_ucs4.
|
||||
|
||||
* gutf8.c (g_utf8_validate):
|
||||
- add g_return_if_fail (str != NULL).
|
||||
- add checks for overlong strings, non-valid Unicode characters (>= 110000)
|
||||
and single surrogates.
|
||||
|
||||
2001-01-05 Tor Lillqvist <tml@iki.fi>
|
||||
|
||||
* testglib.c (main): Add test for g_path_skip_root().
|
||||
|
@ -1,3 +1,40 @@
|
||||
Fri Jan 5 11:25:42 2001 Owen Taylor <otaylor@redhat.com>
|
||||
|
||||
* configure.in (PACKAGE): move $enable_debug down below
|
||||
checks for GCC to avoid setting CFLAGS prematurely,
|
||||
change checks to avoid adding -g twice.
|
||||
|
||||
* gutf8.c (g_ucs4_to_utf8): Support len < 0 to mean
|
||||
0 termination.
|
||||
|
||||
* gutf8.c (g_utf8_to_ucs4): Terminate result with 0.
|
||||
|
||||
* tests/mainloop-test.c (main): Fix uses of
|
||||
g_main_loop_destroy().
|
||||
|
||||
* tests/unicode-encoding.c tests/Makefile.am tests/utf8.txt:
|
||||
Tests for unicode-conversion code.
|
||||
|
||||
* gconvert.c (g_convert, g_convert_with_fallback): work around
|
||||
a couple of GNU libc bugs.
|
||||
|
||||
* gconvert.[ch] (g_{locale,filename}_{to,from}_utf8): Standardize
|
||||
arguments to match g_convert(). Document.
|
||||
|
||||
* gunicode.[ch]:
|
||||
- Implement conversion functions to and from UTF-16
|
||||
- Standardize unicode conversion functions on prototype like
|
||||
g_convert.
|
||||
- Add a lot of error checking to unicode conversion functions.
|
||||
|
||||
* gunicode.[ch] (g_utf8_to_ucs4_fast): Add fast, non-checking
|
||||
variant of g_utf8_to_ucs4.
|
||||
|
||||
* gutf8.c (g_utf8_validate):
|
||||
- add g_return_if_fail (str != NULL).
|
||||
- add checks for overlong strings, non-valid Unicode characters (>= 110000)
|
||||
and single surrogates.
|
||||
|
||||
2001-01-05 Tor Lillqvist <tml@iki.fi>
|
||||
|
||||
* testglib.c (main): Add test for g_path_skip_root().
|
||||
|
@ -1,3 +1,40 @@
|
||||
Fri Jan 5 11:25:42 2001 Owen Taylor <otaylor@redhat.com>
|
||||
|
||||
* configure.in (PACKAGE): move $enable_debug down below
|
||||
checks for GCC to avoid setting CFLAGS prematurely,
|
||||
change checks to avoid adding -g twice.
|
||||
|
||||
* gutf8.c (g_ucs4_to_utf8): Support len < 0 to mean
|
||||
0 termination.
|
||||
|
||||
* gutf8.c (g_utf8_to_ucs4): Terminate result with 0.
|
||||
|
||||
* tests/mainloop-test.c (main): Fix uses of
|
||||
g_main_loop_destroy().
|
||||
|
||||
* tests/unicode-encoding.c tests/Makefile.am tests/utf8.txt:
|
||||
Tests for unicode-conversion code.
|
||||
|
||||
* gconvert.c (g_convert, g_convert_with_fallback): work around
|
||||
a couple of GNU libc bugs.
|
||||
|
||||
* gconvert.[ch] (g_{locale,filename}_{to,from}_utf8): Standardize
|
||||
arguments to match g_convert(). Document.
|
||||
|
||||
* gunicode.[ch]:
|
||||
- Implement conversion functions to and from UTF-16
|
||||
- Standardize unicode conversion functions on prototype like
|
||||
g_convert.
|
||||
- Add a lot of error checking to unicode conversion functions.
|
||||
|
||||
* gunicode.[ch] (g_utf8_to_ucs4_fast): Add fast, non-checking
|
||||
variant of g_utf8_to_ucs4.
|
||||
|
||||
* gutf8.c (g_utf8_validate):
|
||||
- add g_return_if_fail (str != NULL).
|
||||
- add checks for overlong strings, non-valid Unicode characters (>= 110000)
|
||||
and single surrogates.
|
||||
|
||||
2001-01-05 Tor Lillqvist <tml@iki.fi>
|
||||
|
||||
* testglib.c (main): Add test for g_path_skip_root().
|
||||
|
@ -1,3 +1,40 @@
|
||||
Fri Jan 5 11:25:42 2001 Owen Taylor <otaylor@redhat.com>
|
||||
|
||||
* configure.in (PACKAGE): move $enable_debug down below
|
||||
checks for GCC to avoid setting CFLAGS prematurely,
|
||||
change checks to avoid adding -g twice.
|
||||
|
||||
* gutf8.c (g_ucs4_to_utf8): Support len < 0 to mean
|
||||
0 termination.
|
||||
|
||||
* gutf8.c (g_utf8_to_ucs4): Terminate result with 0.
|
||||
|
||||
* tests/mainloop-test.c (main): Fix uses of
|
||||
g_main_loop_destroy().
|
||||
|
||||
* tests/unicode-encoding.c tests/Makefile.am tests/utf8.txt:
|
||||
Tests for unicode-conversion code.
|
||||
|
||||
* gconvert.c (g_convert, g_convert_with_fallback): work around
|
||||
a couple of GNU libc bugs.
|
||||
|
||||
* gconvert.[ch] (g_{locale,filename}_{to,from}_utf8): Standardize
|
||||
arguments to match g_convert(). Document.
|
||||
|
||||
* gunicode.[ch]:
|
||||
- Implement conversion functions to and from UTF-16
|
||||
- Standardize unicode conversion functions on prototype like
|
||||
g_convert.
|
||||
- Add a lot of error checking to unicode conversion functions.
|
||||
|
||||
* gunicode.[ch] (g_utf8_to_ucs4_fast): Add fast, non-checking
|
||||
variant of g_utf8_to_ucs4.
|
||||
|
||||
* gutf8.c (g_utf8_validate):
|
||||
- add g_return_if_fail (str != NULL).
|
||||
- add checks for overlong strings, non-valid Unicode characters (>= 110000)
|
||||
and single surrogates.
|
||||
|
||||
2001-01-05 Tor Lillqvist <tml@iki.fi>
|
||||
|
||||
* testglib.c (main): Add test for g_path_skip_root().
|
||||
|
@ -1,3 +1,40 @@
|
||||
Fri Jan 5 11:25:42 2001 Owen Taylor <otaylor@redhat.com>
|
||||
|
||||
* configure.in (PACKAGE): move $enable_debug down below
|
||||
checks for GCC to avoid setting CFLAGS prematurely,
|
||||
change checks to avoid adding -g twice.
|
||||
|
||||
* gutf8.c (g_ucs4_to_utf8): Support len < 0 to mean
|
||||
0 termination.
|
||||
|
||||
* gutf8.c (g_utf8_to_ucs4): Terminate result with 0.
|
||||
|
||||
* tests/mainloop-test.c (main): Fix uses of
|
||||
g_main_loop_destroy().
|
||||
|
||||
* tests/unicode-encoding.c tests/Makefile.am tests/utf8.txt:
|
||||
Tests for unicode-conversion code.
|
||||
|
||||
* gconvert.c (g_convert, g_convert_with_fallback): work around
|
||||
a couple of GNU libc bugs.
|
||||
|
||||
* gconvert.[ch] (g_{locale,filename}_{to,from}_utf8): Standardize
|
||||
arguments to match g_convert(). Document.
|
||||
|
||||
* gunicode.[ch]:
|
||||
- Implement conversion functions to and from UTF-16
|
||||
- Standardize unicode conversion functions on prototype like
|
||||
g_convert.
|
||||
- Add a lot of error checking to unicode conversion functions.
|
||||
|
||||
* gunicode.[ch] (g_utf8_to_ucs4_fast): Add fast, non-checking
|
||||
variant of g_utf8_to_ucs4.
|
||||
|
||||
* gutf8.c (g_utf8_validate):
|
||||
- add g_return_if_fail (str != NULL).
|
||||
- add checks for overlong strings, non-valid Unicode characters (>= 110000)
|
||||
and single surrogates.
|
||||
|
||||
2001-01-05 Tor Lillqvist <tml@iki.fi>
|
||||
|
||||
* testglib.c (main): Add test for g_path_skip_root().
|
||||
|
24
configure.in
24
configure.in
@ -114,15 +114,6 @@ if test "x$enable_threads" != "xyes"; then
|
||||
enable_threads=no
|
||||
fi
|
||||
|
||||
if test "x$enable_debug" = "xyes"; then
|
||||
test "$cflags_set" = set || CFLAGS="$CFLAGS -g"
|
||||
GLIB_DEBUG_FLAGS="-DG_ENABLE_DEBUG"
|
||||
else
|
||||
if test "x$enable_debug" = "xno"; then
|
||||
GLIB_DEBUG_FLAGS="-DG_DISABLE_ASSERT -DG_DISABLE_CHECKS"
|
||||
fi
|
||||
fi
|
||||
|
||||
AC_DEFINE_UNQUOTED(G_COMPILED_WITH_DEBUGGING, "${enable_debug}",
|
||||
[Whether glib was compiled with debugging enabled])
|
||||
|
||||
@ -154,6 +145,21 @@ AC_PROG_CC
|
||||
AM_PROG_CC_STDC
|
||||
AC_PROG_INSTALL
|
||||
|
||||
if test "x$enable_debug" = "xyes"; then
|
||||
if test x$cflags_set != xset ; then
|
||||
case " $CFLAGS " in
|
||||
*[[\ \ ]]-g[[\ \ ]]*) ;;
|
||||
*) CFLAGS="$CFLAGS -g" ;;
|
||||
esac
|
||||
fi
|
||||
|
||||
GLIB_DEBUG_FLAGS="-DG_ENABLE_DEBUG"
|
||||
else
|
||||
if test "x$enable_debug" = "xno"; then
|
||||
GLIB_DEBUG_FLAGS="-DG_DISABLE_ASSERT -DG_DISABLE_CHECKS"
|
||||
fi
|
||||
fi
|
||||
|
||||
# define a MAINT-like variable REBUILD which is set if Perl
|
||||
# and awk are found, so autogenerated sources can be rebuilt
|
||||
AC_PROG_AWK
|
||||
|
258
gconvert.c
258
gconvert.c
@ -170,7 +170,11 @@ g_convert (const gchar *str,
|
||||
|
||||
p = str;
|
||||
inbytes_remaining = len;
|
||||
outbuf_size = len + 1; /* + 1 for nul in case len == 1 */
|
||||
|
||||
/* Due to a GLIBC bug, round outbuf_size up to a multiple of 4 */
|
||||
/* + 1 for nul in case len == 1 */
|
||||
outbuf_size = ((len + 3) & ~3) + 1;
|
||||
|
||||
outbytes_remaining = outbuf_size - 1; /* -1 for nul */
|
||||
outp = dest = g_malloc (outbuf_size);
|
||||
|
||||
@ -188,11 +192,20 @@ g_convert (const gchar *str,
|
||||
case E2BIG:
|
||||
{
|
||||
size_t used = outp - dest;
|
||||
outbuf_size *= 2;
|
||||
dest = g_realloc (dest, outbuf_size);
|
||||
|
||||
outp = dest + used;
|
||||
outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
|
||||
/* glibc's iconv can return E2BIG even if there is space
|
||||
* remaining if an internal buffer is exhausted. The
|
||||
* following is a heuristic to catch this. The 16 is
|
||||
* pretty arbitrary.
|
||||
*/
|
||||
if (used + 16 > outbuf_size)
|
||||
{
|
||||
outbuf_size = (outbuf_size - 1) * 2 + 1;
|
||||
dest = g_realloc (dest, outbuf_size);
|
||||
|
||||
outp = dest + used;
|
||||
outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
|
||||
}
|
||||
|
||||
goto again;
|
||||
}
|
||||
@ -353,7 +366,9 @@ g_convert_with_fallback (const gchar *str,
|
||||
* for the original string while we are converting the fallback
|
||||
*/
|
||||
p = utf8;
|
||||
outbuf_size = len + 1; /* + 1 for nul in case len == 1 */
|
||||
/* Due to a GLIBC bug, round outbuf_size up to a multiple of 4 */
|
||||
/* + 1 for nul in case len == 1 */
|
||||
outbuf_size = ((len + 3) & ~3) + 1;
|
||||
outbytes_remaining = outbuf_size - 1; /* -1 for nul */
|
||||
outp = dest = g_malloc (outbuf_size);
|
||||
|
||||
@ -373,11 +388,20 @@ g_convert_with_fallback (const gchar *str,
|
||||
case E2BIG:
|
||||
{
|
||||
size_t used = outp - dest;
|
||||
outbuf_size *= 2;
|
||||
dest = g_realloc (dest, outbuf_size);
|
||||
|
||||
outp = dest + used;
|
||||
outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
|
||||
|
||||
/* glibc's iconv can return E2BIG even if there is space
|
||||
* remaining if an internal buffer is exhausted. The
|
||||
* following is a heuristic to catch this. The 16 is
|
||||
* pretty arbitrary.
|
||||
*/
|
||||
if (used + 16 > outbuf_size)
|
||||
{
|
||||
outbuf_size = (outbuf_size - 1) * 2 + 1;
|
||||
dest = g_realloc (dest, outbuf_size);
|
||||
|
||||
outp = dest + used;
|
||||
outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
@ -458,18 +482,44 @@ g_convert_with_fallback (const gchar *str,
|
||||
/*
|
||||
* g_locale_to_utf8
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
* g_locale_to_utf8:
|
||||
* @opsysstring: a string in the encoding of the current locale
|
||||
* @len: the length of the string, or -1 if the string is
|
||||
* NULL-terminated.
|
||||
* @bytes_read: location to store the number of bytes in the
|
||||
* input string that were successfully converted, or %NULL.
|
||||
* Even if the conversion was successful, this may be
|
||||
* less than len if there were partial characters
|
||||
* at the end of the input. If the error
|
||||
* G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
|
||||
* stored will be the byte offset after the last valid
|
||||
* input sequence.
|
||||
* @bytes_written: the number of bytes stored in the output buffer (not including the
|
||||
* terminating nul).
|
||||
* @error: location to store the error occurring, or %NULL to ignore
|
||||
* errors. Any of the errors in #GConvertError may occur.
|
||||
*
|
||||
* Converts a string which is in the encoding used for strings by
|
||||
* the C runtime (usually the same as that used by the operating
|
||||
* system) in the current locale into a UTF-8 string.
|
||||
*/
|
||||
|
||||
*
|
||||
* Return value: The converted string, or %NULL on an error.
|
||||
**/
|
||||
gchar *
|
||||
g_locale_to_utf8 (const gchar *opsysstring, GError **error)
|
||||
g_locale_to_utf8 (const gchar *opsysstring,
|
||||
gint len,
|
||||
gint *bytes_read,
|
||||
gint *bytes_written,
|
||||
GError **error)
|
||||
{
|
||||
#ifdef G_OS_WIN32
|
||||
|
||||
gint i, clen, wclen, first;
|
||||
const gint len = strlen (opsysstring);
|
||||
gint i, clen, total_len, wclen, first;
|
||||
const gint len = len < 0 ? strlen (opsysstring) : len;
|
||||
wchar_t *wcs, wc;
|
||||
gchar *result, *bp;
|
||||
const wchar_t *wcp;
|
||||
@ -478,26 +528,26 @@ g_locale_to_utf8 (const gchar *opsysstring, GError **error)
|
||||
wclen = MultiByteToWideChar (CP_ACP, 0, opsysstring, len, wcs, len);
|
||||
|
||||
wcp = wcs;
|
||||
clen = 0;
|
||||
total_len = 0;
|
||||
for (i = 0; i < wclen; i++)
|
||||
{
|
||||
wc = *wcp++;
|
||||
|
||||
if (wc < 0x80)
|
||||
clen += 1;
|
||||
total_len += 1;
|
||||
else if (wc < 0x800)
|
||||
clen += 2;
|
||||
total_len += 2;
|
||||
else if (wc < 0x10000)
|
||||
clen += 3;
|
||||
total_len += 3;
|
||||
else if (wc < 0x200000)
|
||||
clen += 4;
|
||||
total_len += 4;
|
||||
else if (wc < 0x4000000)
|
||||
clen += 5;
|
||||
total_len += 5;
|
||||
else
|
||||
clen += 6;
|
||||
total_len += 6;
|
||||
}
|
||||
|
||||
result = g_malloc (clen + 1);
|
||||
result = g_malloc (total_len + 1);
|
||||
|
||||
wcp = wcs;
|
||||
bp = result;
|
||||
@ -553,6 +603,11 @@ g_locale_to_utf8 (const gchar *opsysstring, GError **error)
|
||||
|
||||
g_free (wcs);
|
||||
|
||||
if (bytes_read)
|
||||
*bytes_read = len;
|
||||
if (bytes_written)
|
||||
*bytes_written = total_len;
|
||||
|
||||
return result;
|
||||
|
||||
#else
|
||||
@ -562,26 +617,48 @@ g_locale_to_utf8 (const gchar *opsysstring, GError **error)
|
||||
if (g_get_charset (&charset))
|
||||
return g_strdup (opsysstring);
|
||||
|
||||
str = g_convert (opsysstring, strlen (opsysstring),
|
||||
"UTF-8", charset, NULL, NULL, error);
|
||||
str = g_convert (opsysstring, len,
|
||||
"UTF-8", charset, bytes_read, bytes_written, error);
|
||||
|
||||
return str;
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* g_locale_from_utf8
|
||||
*
|
||||
* The reverse of g_locale_to_utf8.
|
||||
*/
|
||||
|
||||
/**
|
||||
* g_locale_from_utf8:
|
||||
* @utf8string: a UTF-8 encoded string
|
||||
* @len: the length of the string, or -1 if the string is
|
||||
* NULL-terminated.
|
||||
* @bytes_read: location to store the number of bytes in the
|
||||
* input string that were successfully converted, or %NULL.
|
||||
* Even if the conversion was successful, this may be
|
||||
* less than len if there were partial characters
|
||||
* at the end of the input. If the error
|
||||
* G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
|
||||
* stored will be the byte offset after the last valid
|
||||
* input sequence.
|
||||
* @bytes_written: the number of bytes stored in the output buffer (not including the
|
||||
* terminating nul).
|
||||
* @error: location to store the error occurring, or %NULL to ignore
|
||||
* errors. Any of the errors in #GConvertError may occur.
|
||||
*
|
||||
* Converts a string from UTF-8 to the encoding used for strings by
|
||||
* the C runtime (usually the same as that used by the operating
|
||||
* system) in the current locale.
|
||||
*
|
||||
* Return value: The converted string, or %NULL on an error.
|
||||
**/
|
||||
gchar *
|
||||
g_locale_from_utf8 (const gchar *utf8string, GError **error)
|
||||
g_locale_from_utf8 (const gchar *utf8string,
|
||||
gint len,
|
||||
gint *bytes_read,
|
||||
gint *bytes_written,
|
||||
GError **error)
|
||||
{
|
||||
#ifdef G_OS_WIN32
|
||||
|
||||
gint i, mask, clen, mblen;
|
||||
const gint len = strlen (utf8string);
|
||||
const gint len = len < 0 ? strlen (utf8string) : len;
|
||||
wchar_t *wcs, *wcp;
|
||||
gchar *result;
|
||||
guchar *cp, *end, c;
|
||||
@ -671,6 +748,11 @@ g_locale_from_utf8 (const gchar *utf8string, GError **error)
|
||||
result[mblen] = 0;
|
||||
g_free (wcs);
|
||||
|
||||
if (bytes_read)
|
||||
*bytes_read = len;
|
||||
if (bytes_written)
|
||||
*bytes_written = mblen;
|
||||
|
||||
return result;
|
||||
|
||||
#else
|
||||
@ -681,39 +763,123 @@ g_locale_from_utf8 (const gchar *utf8string, GError **error)
|
||||
return g_strdup (utf8string);
|
||||
|
||||
str = g_convert (utf8string, strlen (utf8string),
|
||||
charset, "UTF-8", NULL, NULL, error);
|
||||
charset, "UTF-8", bytes_read, bytes_written, error);
|
||||
|
||||
return str;
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Filenames are in UTF-8 unless specifically requested otherwise */
|
||||
|
||||
/**
|
||||
* g_filename_to_utf8:
|
||||
* @opsysstring: a string in the encoding for filenames
|
||||
* @len: the length of the string, or -1 if the string is
|
||||
* NULL-terminated.
|
||||
* @bytes_read: location to store the number of bytes in the
|
||||
* input string that were successfully converted, or %NULL.
|
||||
* Even if the conversion was successful, this may be
|
||||
* less than len if there were partial characters
|
||||
* at the end of the input. If the error
|
||||
* G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
|
||||
* stored will be the byte offset after the last valid
|
||||
* input sequence.
|
||||
* @bytes_written: the number of bytes stored in the output buffer (not including the
|
||||
* terminating nul).
|
||||
* @error: location to store the error occurring, or %NULL to ignore
|
||||
* errors. Any of the errors in #GConvertError may occur.
|
||||
*
|
||||
* Converts a string which is in the encoding used for filenames
|
||||
* into a UTF-8 string.
|
||||
*
|
||||
* Return value: The converted string, or %NULL on an error.
|
||||
**/
|
||||
gchar*
|
||||
g_filename_to_utf8 (const gchar *string, GError **error)
|
||||
|
||||
g_filename_to_utf8 (const gchar *opsysstring,
|
||||
gint len,
|
||||
gint *bytes_read,
|
||||
gint *bytes_written,
|
||||
GError **error)
|
||||
{
|
||||
#ifdef G_OS_WIN32
|
||||
return g_locale_to_utf8 (string, error);
|
||||
return g_locale_to_utf8 (opsysstring, len,
|
||||
bytes_read, bytes_written,
|
||||
error);
|
||||
#else
|
||||
if (getenv ("G_BROKEN_FILENAMES"))
|
||||
return g_locale_to_utf8 (string, error);
|
||||
return g_locale_to_utf8 (opsysstring, len,
|
||||
bytes_read, bytes_written,
|
||||
error);
|
||||
|
||||
return g_strdup (string);
|
||||
if (bytes_read || bytes_written)
|
||||
{
|
||||
gint len = strlen (opsysstring);
|
||||
|
||||
if (bytes_read)
|
||||
*bytes_read = len;
|
||||
if (bytes_written)
|
||||
*bytes_written = len;
|
||||
}
|
||||
|
||||
if (len < 0)
|
||||
return g_strdup (opsysstring);
|
||||
else
|
||||
return g_strndup (opsysstring, len);
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* g_filename_from_utf8:
|
||||
* @utf8string: a UTF-8 encoded string
|
||||
* @len: the length of the string, or -1 if the string is
|
||||
* NULL-terminated.
|
||||
* @bytes_read: location to store the number of bytes in the
|
||||
* input string that were successfully converted, or %NULL.
|
||||
* Even if the conversion was successful, this may be
|
||||
* less than len if there were partial characters
|
||||
* at the end of the input. If the error
|
||||
* G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
|
||||
* stored will be the byte offset after the last valid
|
||||
* input sequence.
|
||||
* @bytes_written: the number of bytes stored in the output buffer (not including the
|
||||
* terminating nul).
|
||||
* @error: location to store the error occurring, or %NULL to ignore
|
||||
* errors. Any of the errors in #GConvertError may occur.
|
||||
*
|
||||
* Converts a string from UTF-8 to the encoding used for filenames.
|
||||
*
|
||||
* Return value: The converted string, or %NULL on an error.
|
||||
**/
|
||||
gchar*
|
||||
g_filename_from_utf8 (const gchar *string, GError **error)
|
||||
g_filename_from_utf8 (const gchar *utf8string,
|
||||
gint len,
|
||||
gint *bytes_read,
|
||||
gint *bytes_written,
|
||||
GError **error)
|
||||
{
|
||||
#ifdef G_OS_WIN32
|
||||
return g_locale_from_utf8 (string, error);
|
||||
return g_locale_from_utf8 (utf8string, len,
|
||||
bytes_read, bytes_written,
|
||||
error);
|
||||
#else
|
||||
if (getenv ("G_BROKEN_FILENAMES"))
|
||||
return g_locale_from_utf8 (string, error);
|
||||
return g_locale_from_utf8 (utf8string, len,
|
||||
bytes_read, bytes_written,
|
||||
error);
|
||||
|
||||
return g_strdup (string);
|
||||
if (bytes_read || bytes_written)
|
||||
{
|
||||
gint len = strlen (utf8string);
|
||||
|
||||
if (bytes_read)
|
||||
*bytes_read = len;
|
||||
if (bytes_written)
|
||||
*bytes_written = len;
|
||||
}
|
||||
|
||||
if (len < 0)
|
||||
return g_strdup (utf8string);
|
||||
else
|
||||
return g_strndup (utf8string, len);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
24
gconvert.h
24
gconvert.h
@ -76,14 +76,30 @@ gchar* g_convert_with_fallback (const gchar *str,
|
||||
|
||||
/* Convert between libc's idea of strings and UTF-8.
|
||||
*/
|
||||
gchar* g_locale_to_utf8 (const gchar *opsysstring, GError **error);
|
||||
gchar* g_locale_from_utf8 (const gchar *utf8string, GError **error);
|
||||
gchar* g_locale_to_utf8 (const gchar *opsysstring,
|
||||
gint len,
|
||||
gint *bytes_read,
|
||||
gint *bytes_written,
|
||||
GError **error);
|
||||
gchar* g_locale_from_utf8 (const gchar *utf8string,
|
||||
gint len,
|
||||
gint *bytes_read,
|
||||
gint *bytes_written,
|
||||
GError **error);
|
||||
|
||||
/* Convert between the operating system (or C runtime)
|
||||
* representation of file names and UTF-8.
|
||||
*/
|
||||
gchar* g_filename_to_utf8 (const gchar *opsysstring, GError **error);
|
||||
gchar* g_filename_from_utf8 (const gchar *utf8string, GError **error);
|
||||
gchar* g_filename_to_utf8 (const gchar *opsysstring,
|
||||
gint len,
|
||||
gint *bytes_read,
|
||||
gint *bytes_written,
|
||||
GError **error);
|
||||
gchar* g_filename_from_utf8 (const gchar *utf8string,
|
||||
gint len,
|
||||
gint *bytes_read,
|
||||
gint *bytes_written,
|
||||
GError **error);
|
||||
|
||||
G_END_DECLS
|
||||
|
||||
|
258
glib/gconvert.c
258
glib/gconvert.c
@ -170,7 +170,11 @@ g_convert (const gchar *str,
|
||||
|
||||
p = str;
|
||||
inbytes_remaining = len;
|
||||
outbuf_size = len + 1; /* + 1 for nul in case len == 1 */
|
||||
|
||||
/* Due to a GLIBC bug, round outbuf_size up to a multiple of 4 */
|
||||
/* + 1 for nul in case len == 1 */
|
||||
outbuf_size = ((len + 3) & ~3) + 1;
|
||||
|
||||
outbytes_remaining = outbuf_size - 1; /* -1 for nul */
|
||||
outp = dest = g_malloc (outbuf_size);
|
||||
|
||||
@ -188,11 +192,20 @@ g_convert (const gchar *str,
|
||||
case E2BIG:
|
||||
{
|
||||
size_t used = outp - dest;
|
||||
outbuf_size *= 2;
|
||||
dest = g_realloc (dest, outbuf_size);
|
||||
|
||||
outp = dest + used;
|
||||
outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
|
||||
/* glibc's iconv can return E2BIG even if there is space
|
||||
* remaining if an internal buffer is exhausted. The
|
||||
* following is a heuristic to catch this. The 16 is
|
||||
* pretty arbitrary.
|
||||
*/
|
||||
if (used + 16 > outbuf_size)
|
||||
{
|
||||
outbuf_size = (outbuf_size - 1) * 2 + 1;
|
||||
dest = g_realloc (dest, outbuf_size);
|
||||
|
||||
outp = dest + used;
|
||||
outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
|
||||
}
|
||||
|
||||
goto again;
|
||||
}
|
||||
@ -353,7 +366,9 @@ g_convert_with_fallback (const gchar *str,
|
||||
* for the original string while we are converting the fallback
|
||||
*/
|
||||
p = utf8;
|
||||
outbuf_size = len + 1; /* + 1 for nul in case len == 1 */
|
||||
/* Due to a GLIBC bug, round outbuf_size up to a multiple of 4 */
|
||||
/* + 1 for nul in case len == 1 */
|
||||
outbuf_size = ((len + 3) & ~3) + 1;
|
||||
outbytes_remaining = outbuf_size - 1; /* -1 for nul */
|
||||
outp = dest = g_malloc (outbuf_size);
|
||||
|
||||
@ -373,11 +388,20 @@ g_convert_with_fallback (const gchar *str,
|
||||
case E2BIG:
|
||||
{
|
||||
size_t used = outp - dest;
|
||||
outbuf_size *= 2;
|
||||
dest = g_realloc (dest, outbuf_size);
|
||||
|
||||
outp = dest + used;
|
||||
outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
|
||||
|
||||
/* glibc's iconv can return E2BIG even if there is space
|
||||
* remaining if an internal buffer is exhausted. The
|
||||
* following is a heuristic to catch this. The 16 is
|
||||
* pretty arbitrary.
|
||||
*/
|
||||
if (used + 16 > outbuf_size)
|
||||
{
|
||||
outbuf_size = (outbuf_size - 1) * 2 + 1;
|
||||
dest = g_realloc (dest, outbuf_size);
|
||||
|
||||
outp = dest + used;
|
||||
outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
@ -458,18 +482,44 @@ g_convert_with_fallback (const gchar *str,
|
||||
/*
|
||||
* g_locale_to_utf8
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
* g_locale_to_utf8:
|
||||
* @opsysstring: a string in the encoding of the current locale
|
||||
* @len: the length of the string, or -1 if the string is
|
||||
* NULL-terminated.
|
||||
* @bytes_read: location to store the number of bytes in the
|
||||
* input string that were successfully converted, or %NULL.
|
||||
* Even if the conversion was successful, this may be
|
||||
* less than len if there were partial characters
|
||||
* at the end of the input. If the error
|
||||
* G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
|
||||
* stored will be the byte offset after the last valid
|
||||
* input sequence.
|
||||
* @bytes_written: the number of bytes stored in the output buffer (not including the
|
||||
* terminating nul).
|
||||
* @error: location to store the error occurring, or %NULL to ignore
|
||||
* errors. Any of the errors in #GConvertError may occur.
|
||||
*
|
||||
* Converts a string which is in the encoding used for strings by
|
||||
* the C runtime (usually the same as that used by the operating
|
||||
* system) in the current locale into a UTF-8 string.
|
||||
*/
|
||||
|
||||
*
|
||||
* Return value: The converted string, or %NULL on an error.
|
||||
**/
|
||||
gchar *
|
||||
g_locale_to_utf8 (const gchar *opsysstring, GError **error)
|
||||
g_locale_to_utf8 (const gchar *opsysstring,
|
||||
gint len,
|
||||
gint *bytes_read,
|
||||
gint *bytes_written,
|
||||
GError **error)
|
||||
{
|
||||
#ifdef G_OS_WIN32
|
||||
|
||||
gint i, clen, wclen, first;
|
||||
const gint len = strlen (opsysstring);
|
||||
gint i, clen, total_len, wclen, first;
|
||||
const gint len = len < 0 ? strlen (opsysstring) : len;
|
||||
wchar_t *wcs, wc;
|
||||
gchar *result, *bp;
|
||||
const wchar_t *wcp;
|
||||
@ -478,26 +528,26 @@ g_locale_to_utf8 (const gchar *opsysstring, GError **error)
|
||||
wclen = MultiByteToWideChar (CP_ACP, 0, opsysstring, len, wcs, len);
|
||||
|
||||
wcp = wcs;
|
||||
clen = 0;
|
||||
total_len = 0;
|
||||
for (i = 0; i < wclen; i++)
|
||||
{
|
||||
wc = *wcp++;
|
||||
|
||||
if (wc < 0x80)
|
||||
clen += 1;
|
||||
total_len += 1;
|
||||
else if (wc < 0x800)
|
||||
clen += 2;
|
||||
total_len += 2;
|
||||
else if (wc < 0x10000)
|
||||
clen += 3;
|
||||
total_len += 3;
|
||||
else if (wc < 0x200000)
|
||||
clen += 4;
|
||||
total_len += 4;
|
||||
else if (wc < 0x4000000)
|
||||
clen += 5;
|
||||
total_len += 5;
|
||||
else
|
||||
clen += 6;
|
||||
total_len += 6;
|
||||
}
|
||||
|
||||
result = g_malloc (clen + 1);
|
||||
result = g_malloc (total_len + 1);
|
||||
|
||||
wcp = wcs;
|
||||
bp = result;
|
||||
@ -553,6 +603,11 @@ g_locale_to_utf8 (const gchar *opsysstring, GError **error)
|
||||
|
||||
g_free (wcs);
|
||||
|
||||
if (bytes_read)
|
||||
*bytes_read = len;
|
||||
if (bytes_written)
|
||||
*bytes_written = total_len;
|
||||
|
||||
return result;
|
||||
|
||||
#else
|
||||
@ -562,26 +617,48 @@ g_locale_to_utf8 (const gchar *opsysstring, GError **error)
|
||||
if (g_get_charset (&charset))
|
||||
return g_strdup (opsysstring);
|
||||
|
||||
str = g_convert (opsysstring, strlen (opsysstring),
|
||||
"UTF-8", charset, NULL, NULL, error);
|
||||
str = g_convert (opsysstring, len,
|
||||
"UTF-8", charset, bytes_read, bytes_written, error);
|
||||
|
||||
return str;
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* g_locale_from_utf8
|
||||
*
|
||||
* The reverse of g_locale_to_utf8.
|
||||
*/
|
||||
|
||||
/**
|
||||
* g_locale_from_utf8:
|
||||
* @utf8string: a UTF-8 encoded string
|
||||
* @len: the length of the string, or -1 if the string is
|
||||
* NULL-terminated.
|
||||
* @bytes_read: location to store the number of bytes in the
|
||||
* input string that were successfully converted, or %NULL.
|
||||
* Even if the conversion was successful, this may be
|
||||
* less than len if there were partial characters
|
||||
* at the end of the input. If the error
|
||||
* G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
|
||||
* stored will be the byte offset after the last valid
|
||||
* input sequence.
|
||||
* @bytes_written: the number of bytes stored in the output buffer (not including the
|
||||
* terminating nul).
|
||||
* @error: location to store the error occurring, or %NULL to ignore
|
||||
* errors. Any of the errors in #GConvertError may occur.
|
||||
*
|
||||
* Converts a string from UTF-8 to the encoding used for strings by
|
||||
* the C runtime (usually the same as that used by the operating
|
||||
* system) in the current locale.
|
||||
*
|
||||
* Return value: The converted string, or %NULL on an error.
|
||||
**/
|
||||
gchar *
|
||||
g_locale_from_utf8 (const gchar *utf8string, GError **error)
|
||||
g_locale_from_utf8 (const gchar *utf8string,
|
||||
gint len,
|
||||
gint *bytes_read,
|
||||
gint *bytes_written,
|
||||
GError **error)
|
||||
{
|
||||
#ifdef G_OS_WIN32
|
||||
|
||||
gint i, mask, clen, mblen;
|
||||
const gint len = strlen (utf8string);
|
||||
const gint len = len < 0 ? strlen (utf8string) : len;
|
||||
wchar_t *wcs, *wcp;
|
||||
gchar *result;
|
||||
guchar *cp, *end, c;
|
||||
@ -671,6 +748,11 @@ g_locale_from_utf8 (const gchar *utf8string, GError **error)
|
||||
result[mblen] = 0;
|
||||
g_free (wcs);
|
||||
|
||||
if (bytes_read)
|
||||
*bytes_read = len;
|
||||
if (bytes_written)
|
||||
*bytes_written = mblen;
|
||||
|
||||
return result;
|
||||
|
||||
#else
|
||||
@ -681,39 +763,123 @@ g_locale_from_utf8 (const gchar *utf8string, GError **error)
|
||||
return g_strdup (utf8string);
|
||||
|
||||
str = g_convert (utf8string, strlen (utf8string),
|
||||
charset, "UTF-8", NULL, NULL, error);
|
||||
charset, "UTF-8", bytes_read, bytes_written, error);
|
||||
|
||||
return str;
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Filenames are in UTF-8 unless specifically requested otherwise */
|
||||
|
||||
/**
|
||||
* g_filename_to_utf8:
|
||||
* @opsysstring: a string in the encoding for filenames
|
||||
* @len: the length of the string, or -1 if the string is
|
||||
* NULL-terminated.
|
||||
* @bytes_read: location to store the number of bytes in the
|
||||
* input string that were successfully converted, or %NULL.
|
||||
* Even if the conversion was successful, this may be
|
||||
* less than len if there were partial characters
|
||||
* at the end of the input. If the error
|
||||
* G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
|
||||
* stored will be the byte offset after the last valid
|
||||
* input sequence.
|
||||
* @bytes_written: the number of bytes stored in the output buffer (not including the
|
||||
* terminating nul).
|
||||
* @error: location to store the error occurring, or %NULL to ignore
|
||||
* errors. Any of the errors in #GConvertError may occur.
|
||||
*
|
||||
* Converts a string which is in the encoding used for filenames
|
||||
* into a UTF-8 string.
|
||||
*
|
||||
* Return value: The converted string, or %NULL on an error.
|
||||
**/
|
||||
gchar*
|
||||
g_filename_to_utf8 (const gchar *string, GError **error)
|
||||
|
||||
g_filename_to_utf8 (const gchar *opsysstring,
|
||||
gint len,
|
||||
gint *bytes_read,
|
||||
gint *bytes_written,
|
||||
GError **error)
|
||||
{
|
||||
#ifdef G_OS_WIN32
|
||||
return g_locale_to_utf8 (string, error);
|
||||
return g_locale_to_utf8 (opsysstring, len,
|
||||
bytes_read, bytes_written,
|
||||
error);
|
||||
#else
|
||||
if (getenv ("G_BROKEN_FILENAMES"))
|
||||
return g_locale_to_utf8 (string, error);
|
||||
return g_locale_to_utf8 (opsysstring, len,
|
||||
bytes_read, bytes_written,
|
||||
error);
|
||||
|
||||
return g_strdup (string);
|
||||
if (bytes_read || bytes_written)
|
||||
{
|
||||
gint len = strlen (opsysstring);
|
||||
|
||||
if (bytes_read)
|
||||
*bytes_read = len;
|
||||
if (bytes_written)
|
||||
*bytes_written = len;
|
||||
}
|
||||
|
||||
if (len < 0)
|
||||
return g_strdup (opsysstring);
|
||||
else
|
||||
return g_strndup (opsysstring, len);
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* g_filename_from_utf8:
|
||||
* @utf8string: a UTF-8 encoded string
|
||||
* @len: the length of the string, or -1 if the string is
|
||||
* NULL-terminated.
|
||||
* @bytes_read: location to store the number of bytes in the
|
||||
* input string that were successfully converted, or %NULL.
|
||||
* Even if the conversion was successful, this may be
|
||||
* less than len if there were partial characters
|
||||
* at the end of the input. If the error
|
||||
* G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
|
||||
* stored will be the byte offset after the last valid
|
||||
* input sequence.
|
||||
* @bytes_written: the number of bytes stored in the output buffer (not including the
|
||||
* terminating nul).
|
||||
* @error: location to store the error occurring, or %NULL to ignore
|
||||
* errors. Any of the errors in #GConvertError may occur.
|
||||
*
|
||||
* Converts a string from UTF-8 to the encoding used for filenames.
|
||||
*
|
||||
* Return value: The converted string, or %NULL on an error.
|
||||
**/
|
||||
gchar*
|
||||
g_filename_from_utf8 (const gchar *string, GError **error)
|
||||
g_filename_from_utf8 (const gchar *utf8string,
|
||||
gint len,
|
||||
gint *bytes_read,
|
||||
gint *bytes_written,
|
||||
GError **error)
|
||||
{
|
||||
#ifdef G_OS_WIN32
|
||||
return g_locale_from_utf8 (string, error);
|
||||
return g_locale_from_utf8 (utf8string, len,
|
||||
bytes_read, bytes_written,
|
||||
error);
|
||||
#else
|
||||
if (getenv ("G_BROKEN_FILENAMES"))
|
||||
return g_locale_from_utf8 (string, error);
|
||||
return g_locale_from_utf8 (utf8string, len,
|
||||
bytes_read, bytes_written,
|
||||
error);
|
||||
|
||||
return g_strdup (string);
|
||||
if (bytes_read || bytes_written)
|
||||
{
|
||||
gint len = strlen (utf8string);
|
||||
|
||||
if (bytes_read)
|
||||
*bytes_read = len;
|
||||
if (bytes_written)
|
||||
*bytes_written = len;
|
||||
}
|
||||
|
||||
if (len < 0)
|
||||
return g_strdup (utf8string);
|
||||
else
|
||||
return g_strndup (utf8string, len);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -76,14 +76,30 @@ gchar* g_convert_with_fallback (const gchar *str,
|
||||
|
||||
/* Convert between libc's idea of strings and UTF-8.
|
||||
*/
|
||||
gchar* g_locale_to_utf8 (const gchar *opsysstring, GError **error);
|
||||
gchar* g_locale_from_utf8 (const gchar *utf8string, GError **error);
|
||||
gchar* g_locale_to_utf8 (const gchar *opsysstring,
|
||||
gint len,
|
||||
gint *bytes_read,
|
||||
gint *bytes_written,
|
||||
GError **error);
|
||||
gchar* g_locale_from_utf8 (const gchar *utf8string,
|
||||
gint len,
|
||||
gint *bytes_read,
|
||||
gint *bytes_written,
|
||||
GError **error);
|
||||
|
||||
/* Convert between the operating system (or C runtime)
|
||||
* representation of file names and UTF-8.
|
||||
*/
|
||||
gchar* g_filename_to_utf8 (const gchar *opsysstring, GError **error);
|
||||
gchar* g_filename_from_utf8 (const gchar *utf8string, GError **error);
|
||||
gchar* g_filename_to_utf8 (const gchar *opsysstring,
|
||||
gint len,
|
||||
gint *bytes_read,
|
||||
gint *bytes_written,
|
||||
GError **error);
|
||||
gchar* g_filename_from_utf8 (const gchar *utf8string,
|
||||
gint len,
|
||||
gint *bytes_read,
|
||||
gint *bytes_written,
|
||||
GError **error);
|
||||
|
||||
G_END_DECLS
|
||||
|
||||
|
@ -206,18 +206,39 @@ gchar *g_utf8_strchr (const gchar *p,
|
||||
gchar *g_utf8_strrchr (const gchar *p,
|
||||
gunichar c);
|
||||
|
||||
gunichar2 *g_utf8_to_utf16 (const gchar *str,
|
||||
gint len);
|
||||
gunichar * g_utf8_to_ucs4 (const gchar *str,
|
||||
gint len);
|
||||
gunichar * g_utf16_to_ucs4 (const gunichar2 *str,
|
||||
gint len);
|
||||
gchar * g_utf16_to_utf8 (const gunichar2 *str,
|
||||
gint len);
|
||||
gunichar * g_ucs4_to_utf16 (const gunichar *str,
|
||||
gint len);
|
||||
gchar * g_ucs4_to_utf8 (const gunichar *str,
|
||||
gint len);
|
||||
gunichar2 *g_utf8_to_utf16 (const gchar *str,
|
||||
gint len,
|
||||
gint *items_read,
|
||||
gint *items_written,
|
||||
GError **error);
|
||||
gunichar * g_utf8_to_ucs4 (const gchar *str,
|
||||
gint len,
|
||||
gint *items_read,
|
||||
gint *items_written,
|
||||
GError **error);
|
||||
gunichar * g_utf8_to_ucs4_fast (const gchar *str,
|
||||
gint len,
|
||||
gint *items_written);
|
||||
gunichar * g_utf16_to_ucs4 (const gunichar2 *str,
|
||||
gint len,
|
||||
gint *items_read,
|
||||
gint *items_written,
|
||||
GError **error);
|
||||
gchar * g_utf16_to_utf8 (const gunichar2 *str,
|
||||
gint len,
|
||||
gint *items_read,
|
||||
gint *items_written,
|
||||
GError **error);
|
||||
gunichar2 *g_ucs4_to_utf16 (const gunichar *str,
|
||||
gint len,
|
||||
gint *items_read,
|
||||
gint *items_written,
|
||||
GError **error);
|
||||
gchar * g_ucs4_to_utf8 (const gunichar *str,
|
||||
gint len,
|
||||
gint *items_read,
|
||||
gint *items_written,
|
||||
GError **error);
|
||||
|
||||
/* Convert a single character into UTF-8. outbuf must have at
|
||||
* least 6 bytes of space. Returns the number of bytes in the
|
||||
|
840
glib/gutf8.c
840
glib/gutf8.c
@ -33,6 +33,8 @@
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
#define _(s) (s)
|
||||
|
||||
#define UTF8_COMPUTE(Char, Mask, Len) \
|
||||
if (Char < 128) \
|
||||
{ \
|
||||
@ -67,6 +69,14 @@
|
||||
else \
|
||||
Len = -1;
|
||||
|
||||
#define UTF8_LENGTH(Char) \
|
||||
((Char) < 0x80 ? 1 : \
|
||||
((Char) < 0x800 ? 2 : \
|
||||
((Char) < 0x10000 ? 3 : \
|
||||
((Char) < 0x200000 ? 4 : \
|
||||
((Char) < 0x4000000 ? 5 : 6)))))
|
||||
|
||||
|
||||
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
|
||||
(Result) = (Chars)[0] & (Mask); \
|
||||
for ((Count) = 1; (Count) < (Len); ++(Count)) \
|
||||
@ -79,6 +89,13 @@
|
||||
(Result) <<= 6; \
|
||||
(Result) |= ((Chars)[(Count)] & 0x3f); \
|
||||
}
|
||||
|
||||
#define UNICODE_VALID(Char) \
|
||||
((Char) < 0x110000 && \
|
||||
((Char) < 0xD800 || (Char) >= 0xE000) && \
|
||||
(Char) != 0xFFFE && (Char) != 0xFFFF)
|
||||
|
||||
|
||||
gchar g_utf8_skip[256] = {
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
@ -473,33 +490,272 @@ unicode_strrchr (const char *p, gunichar c)
|
||||
#endif
|
||||
|
||||
|
||||
/* Like g_utf8_get_char, but take a maximum length
|
||||
* and return (gunichar)-2 on incomplete trailing character
|
||||
*/
|
||||
static inline gunichar
|
||||
g_utf8_get_char_extended (const gchar *p, int max_len)
|
||||
{
|
||||
gint i, len;
|
||||
gunichar wc = (guchar) *p;
|
||||
|
||||
if (wc < 0x80)
|
||||
{
|
||||
return wc;
|
||||
}
|
||||
else if (wc < 0xc0)
|
||||
{
|
||||
return (gunichar)-1;
|
||||
}
|
||||
else if (wc < 0xe0)
|
||||
{
|
||||
len = 2;
|
||||
wc &= 0x1f;
|
||||
}
|
||||
else if (wc < 0xf0)
|
||||
{
|
||||
len = 3;
|
||||
wc &= 0x0f;
|
||||
}
|
||||
else if (wc < 0xf8)
|
||||
{
|
||||
len = 4;
|
||||
wc &= 0x07;
|
||||
}
|
||||
else if (wc < 0xfc)
|
||||
{
|
||||
len = 5;
|
||||
wc &= 0x03;
|
||||
}
|
||||
else if (wc < 0xfe)
|
||||
{
|
||||
len = 6;
|
||||
wc &= 0x01;
|
||||
}
|
||||
else
|
||||
{
|
||||
return (gunichar)-1;
|
||||
}
|
||||
|
||||
if (len == -1)
|
||||
return (gunichar)-1;
|
||||
if (max_len >= 0 && len > max_len)
|
||||
{
|
||||
for (i = 1; i < max_len; i++)
|
||||
{
|
||||
if ((((guchar *)p)[i] & 0xc0) != 0x80)
|
||||
return (gunichar)-1;
|
||||
}
|
||||
return (gunichar)-2;
|
||||
}
|
||||
|
||||
for (i = 1; i < len; ++i)
|
||||
{
|
||||
gunichar ch = ((guchar *)p)[i];
|
||||
|
||||
if ((ch & 0xc0) != 0x80)
|
||||
{
|
||||
if (ch)
|
||||
return (gunichar)-1;
|
||||
else
|
||||
return (gunichar)-2;
|
||||
}
|
||||
|
||||
wc <<= 6;
|
||||
wc |= (ch & 0x3f);
|
||||
}
|
||||
|
||||
if (UTF8_LENGTH(wc) != len)
|
||||
return (gunichar)-1;
|
||||
|
||||
return wc;
|
||||
}
|
||||
|
||||
/**
|
||||
* g_utf8_to_ucs4:
|
||||
* @str: a UTF-8 encoded string
|
||||
* @len: the length of @
|
||||
*
|
||||
* g_utf8_to_ucs4_fast:
|
||||
* @str: a UTF-8 encoded string
|
||||
* @len: the maximum length of @str to use. If < 0, then
|
||||
* the string is %NULL terminated.
|
||||
* @items_written: location to store the number of characters in the
|
||||
* result, or %NULL.
|
||||
*
|
||||
* Convert a string from UTF-8 to a 32-bit fixed width
|
||||
* representation as UCS-4.
|
||||
* representation as UCS-4, assuming valid UTF-8 input.
|
||||
* This function is roughly twice as fast as g_utf8_to_ucs4()
|
||||
* but does no error checking on the input.
|
||||
*
|
||||
* Return value: a pointer to a newly allocated UCS-4 string.
|
||||
* This value must be freed with g_free()
|
||||
**/
|
||||
gunichar *
|
||||
g_utf8_to_ucs4 (const char *str, int len)
|
||||
g_utf8_to_ucs4_fast (const gchar *str,
|
||||
gint len,
|
||||
gint *items_written)
|
||||
{
|
||||
gint j, charlen;
|
||||
gunichar *result;
|
||||
gint n_chars, i;
|
||||
const gchar *p;
|
||||
|
||||
g_return_val_if_fail (str != NULL, NULL);
|
||||
|
||||
p = str;
|
||||
n_chars = 0;
|
||||
if (len < 0)
|
||||
{
|
||||
while (*p)
|
||||
{
|
||||
p = g_utf8_next_char (p);
|
||||
++n_chars;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
while (*p && p < str + len)
|
||||
{
|
||||
p = g_utf8_next_char (p);
|
||||
++n_chars;
|
||||
}
|
||||
}
|
||||
|
||||
n_chars = g_utf8_strlen (str, len);
|
||||
result = g_new (gunichar, n_chars);
|
||||
result = g_new (gunichar, n_chars + 1);
|
||||
|
||||
p = str;
|
||||
for (i=0; i < n_chars; i++)
|
||||
{
|
||||
result[i] = g_utf8_get_char (p);
|
||||
p = g_utf8_next_char (p);
|
||||
gunichar wc = ((unsigned char *)p)[0];
|
||||
|
||||
if (wc < 0x80)
|
||||
{
|
||||
result[i] = wc;
|
||||
p++;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (wc < 0xe0)
|
||||
{
|
||||
charlen = 2;
|
||||
wc &= 0x1f;
|
||||
}
|
||||
else if (wc < 0xf0)
|
||||
{
|
||||
charlen = 3;
|
||||
wc &= 0x0f;
|
||||
}
|
||||
else if (wc < 0xf8)
|
||||
{
|
||||
charlen = 4;
|
||||
wc &= 0x07;
|
||||
}
|
||||
else if (wc < 0xfc)
|
||||
{
|
||||
charlen = 5;
|
||||
wc &= 0x03;
|
||||
}
|
||||
else
|
||||
{
|
||||
charlen = 6;
|
||||
wc &= 0x01;
|
||||
}
|
||||
|
||||
for (j = 1; j < charlen; j++)
|
||||
{
|
||||
wc <<= 6;
|
||||
wc |= ((unsigned char *)p)[j] & 0x3f;
|
||||
}
|
||||
|
||||
result[i] = wc;
|
||||
p += charlen;
|
||||
}
|
||||
}
|
||||
result[i] = 0;
|
||||
|
||||
if (items_written)
|
||||
*items_written = i;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* g_utf8_to_ucs4:
|
||||
* @str: a UTF-8 encoded string
|
||||
* @len: the maximum length of @str to use. If < 0, then
|
||||
* the string is %NULL terminated.
|
||||
* @items_read: location to store number of bytes read, or %NULL.
|
||||
* If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
|
||||
* returned in case @str contains a trailing partial
|
||||
* character. If an error occurs then the index of the
|
||||
* invalid input is stored here.
|
||||
* @items_written: location to store number of characters written or %NULL.
|
||||
* The value stored here does not include the trailing 0
|
||||
* character.
|
||||
* @error: location to store the error occurring, or %NULL to ignore
|
||||
* errors. Any of the errors in #GConvertError other than
|
||||
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
|
||||
*
|
||||
* Convert a string from UTF-8 to a 32-bit fixed width
|
||||
* representation as UCS-4. A trailing 0 will be added to the
|
||||
* string after the converted text.
|
||||
*
|
||||
* Return value: a pointer to a newly allocated UCS-4 string.
|
||||
* This value must be freed with g_free(). If an
|
||||
* error occurs, %NULL will be returned and
|
||||
* @error set.
|
||||
**/
|
||||
gunichar *
|
||||
g_utf8_to_ucs4 (const gchar *str,
|
||||
gint len,
|
||||
gint *items_read,
|
||||
gint *items_written,
|
||||
GError **error)
|
||||
{
|
||||
gunichar *result = NULL;
|
||||
gint n_chars, i;
|
||||
const gchar *in;
|
||||
|
||||
in = str;
|
||||
n_chars = 0;
|
||||
while ((len < 0 || str + len - in > 0) && *in)
|
||||
{
|
||||
gunichar wc = g_utf8_get_char_extended (in, str + len - in);
|
||||
if (wc & 0x80000000)
|
||||
{
|
||||
if (wc == (gunichar)-2)
|
||||
{
|
||||
if (items_read)
|
||||
break;
|
||||
else
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
|
||||
_("Partial character sequence at end of input"));
|
||||
}
|
||||
else
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
||||
_("Invalid byte sequence in conversion input"));
|
||||
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
n_chars++;
|
||||
|
||||
in = g_utf8_next_char (in);
|
||||
}
|
||||
|
||||
result = g_new (gunichar, n_chars + 1);
|
||||
|
||||
in = str;
|
||||
for (i=0; i < n_chars; i++)
|
||||
{
|
||||
result[i] = g_utf8_get_char (in);
|
||||
in = g_utf8_next_char (in);
|
||||
}
|
||||
result[i] = 0;
|
||||
|
||||
if (items_written)
|
||||
*items_written = n_chars;
|
||||
|
||||
err_out:
|
||||
if (items_read)
|
||||
*items_read = in - str;
|
||||
|
||||
return result;
|
||||
}
|
||||
@ -507,35 +763,569 @@ g_utf8_to_ucs4 (const char *str, int len)
|
||||
/**
|
||||
* g_ucs4_to_utf8:
|
||||
* @str: a UCS-4 encoded string
|
||||
* @len: the length of @
|
||||
*
|
||||
* @len: the maximum length of @str to use. If < 0, then
|
||||
* the string is %NULL terminated.
|
||||
* @items_read: location to store number of characters read, or %NULL.
|
||||
* @items_written: location to store number of bytes written or %NULL.
|
||||
* The value stored here does not include the trailing 0
|
||||
* byte.
|
||||
* @error: location to store the error occurring, or %NULL to ignore
|
||||
* errors. Any of the errors in #GConvertError other than
|
||||
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
|
||||
*
|
||||
* Convert a string from a 32-bit fixed width representation as UCS-4
|
||||
* to UTF-8.
|
||||
* to UTF-8. The result will be terminated with a 0 byte.
|
||||
*
|
||||
* Return value: a pointer to a newly allocated UTF-8 string.
|
||||
* This value must be freed with g_free()
|
||||
* This value must be freed with g_free(). If an
|
||||
* error occurs, %NULL will be returned and
|
||||
* @error set.
|
||||
**/
|
||||
gchar *
|
||||
g_ucs4_to_utf8 (const gunichar *str, int len)
|
||||
g_ucs4_to_utf8 (const gunichar *str,
|
||||
gint len,
|
||||
gint *items_read,
|
||||
gint *items_written,
|
||||
GError **error)
|
||||
{
|
||||
gint result_length;
|
||||
gchar *result, *p;
|
||||
gchar *result = NULL;
|
||||
gchar *p;
|
||||
gint i;
|
||||
|
||||
result_length = 0;
|
||||
for (i = 0; i < len ; i++)
|
||||
result_length += g_unichar_to_utf8 (str[i], NULL);
|
||||
for (i = 0; len < 0 || i < len ; i++)
|
||||
{
|
||||
if (!str[i])
|
||||
break;
|
||||
|
||||
result_length++;
|
||||
if (str[i] >= 0x80000000)
|
||||
{
|
||||
if (items_read)
|
||||
*items_read = i;
|
||||
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
||||
_("Character out of range for UTF-8"));
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
result_length += UTF8_LENGTH (str[i]);
|
||||
}
|
||||
|
||||
result = g_malloc (result_length + 1);
|
||||
p = result;
|
||||
|
||||
for (i = 0; i < len ; i++)
|
||||
p += g_unichar_to_utf8 (str[i], p);
|
||||
i = 0;
|
||||
while (p < result + result_length)
|
||||
p += g_unichar_to_utf8 (str[i++], p);
|
||||
|
||||
*p = '\0';
|
||||
|
||||
if (items_written)
|
||||
*items_written = p - result;
|
||||
|
||||
err_out:
|
||||
if (items_read)
|
||||
*items_read = i;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
#define SURROGATE_VALUE(h,l) (((h) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000)
|
||||
|
||||
/**
|
||||
* g_utf16_to_utf8:
|
||||
* @str: a UTF-16 encoded string
|
||||
* @len: the maximum length of @str to use. If < 0, then
|
||||
* the string is terminated with a 0 character.
|
||||
* @items_read: location to store number of words read, or %NULL.
|
||||
* If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
|
||||
* returned in case @str contains a trailing partial
|
||||
* character. If an error occurs then the index of the
|
||||
* invalid input is stored here.
|
||||
* @items_written: location to store number of bytes written, or %NULL.
|
||||
* The value stored here does not include the trailing
|
||||
* 0 byte.
|
||||
* @error: location to store the error occurring, or %NULL to ignore
|
||||
* errors. Any of the errors in #GConvertError other than
|
||||
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
|
||||
*
|
||||
* Convert a string from UTF-16 to UTF-8. The result will be
|
||||
* terminated with a 0 byte.
|
||||
*
|
||||
* Return value: a pointer to a newly allocated UTF-8 string.
|
||||
* This value must be freed with g_free(). If an
|
||||
* error occurs, %NULL will be returned and
|
||||
* @error set.
|
||||
**/
|
||||
gchar *
|
||||
g_utf16_to_utf8 (const gunichar2 *str,
|
||||
gint len,
|
||||
gint *items_read,
|
||||
gint *items_written,
|
||||
GError **error)
|
||||
{
|
||||
/* This function and g_utf16_to_ucs4 are almost exactly identical - The lines that differ
|
||||
* are marked.
|
||||
*/
|
||||
const gunichar2 *in;
|
||||
gchar *out;
|
||||
gchar *result = NULL;
|
||||
gint n_bytes;
|
||||
gunichar high_surrogate;
|
||||
|
||||
g_return_val_if_fail (str != 0, NULL);
|
||||
|
||||
n_bytes = 0;
|
||||
in = str;
|
||||
high_surrogate = 0;
|
||||
while ((len < 0 || in - str < len) && *in)
|
||||
{
|
||||
gunichar2 c = *in;
|
||||
gunichar wc;
|
||||
|
||||
if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
|
||||
{
|
||||
if (high_surrogate)
|
||||
{
|
||||
wc = SURROGATE_VALUE (high_surrogate, c);
|
||||
high_surrogate = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
||||
_("Invalid sequence in conversion input"));
|
||||
goto err_out;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (high_surrogate)
|
||||
{
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
||||
_("Invalid sequence in conversion input"));
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
|
||||
{
|
||||
high_surrogate = c;
|
||||
goto next1;
|
||||
}
|
||||
else
|
||||
wc = c;
|
||||
}
|
||||
|
||||
/********** DIFFERENT for UTF8/UCS4 **********/
|
||||
n_bytes += UTF8_LENGTH (wc);
|
||||
|
||||
next1:
|
||||
in++;
|
||||
}
|
||||
|
||||
if (high_surrogate && !items_read)
|
||||
{
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
|
||||
_("Partial character sequence at end of input"));
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
/* At this point, everything is valid, and we just need to convert
|
||||
*/
|
||||
/********** DIFFERENT for UTF8/UCS4 **********/
|
||||
result = g_malloc (n_bytes + 1);
|
||||
|
||||
high_surrogate = 0;
|
||||
out = result;
|
||||
in = str;
|
||||
while (out < result + n_bytes)
|
||||
{
|
||||
gunichar2 c = *in;
|
||||
gunichar wc;
|
||||
|
||||
if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
|
||||
{
|
||||
wc = SURROGATE_VALUE (high_surrogate, c);
|
||||
high_surrogate = 0;
|
||||
}
|
||||
else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
|
||||
{
|
||||
high_surrogate = c;
|
||||
goto next2;
|
||||
}
|
||||
else
|
||||
wc = c;
|
||||
|
||||
/********** DIFFERENT for UTF8/UCS4 **********/
|
||||
out += g_unichar_to_utf8 (wc, out);
|
||||
|
||||
next2:
|
||||
in++;
|
||||
}
|
||||
|
||||
/********** DIFFERENT for UTF8/UCS4 **********/
|
||||
*out = '\0';
|
||||
|
||||
if (items_written)
|
||||
/********** DIFFERENT for UTF8/UCS4 **********/
|
||||
*items_written = out - result;
|
||||
|
||||
err_out:
|
||||
if (items_read)
|
||||
*items_read = in - str;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* g_utf16_to_ucs4:
|
||||
* @str: a UTF-16 encoded string
|
||||
* @len: the maximum length of @str to use. If < 0, then
|
||||
* the string is terminated with a 0 character.
|
||||
* @items_read: location to store number of words read, or %NULL.
|
||||
* If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
|
||||
* returned in case @str contains a trailing partial
|
||||
* character. If an error occurs then the index of the
|
||||
* invalid input is stored here.
|
||||
* @items_written: location to store number of characters written, or %NULL.
|
||||
* The value stored here does not include the trailing
|
||||
* 0 character.
|
||||
* @error: location to store the error occurring, or %NULL to ignore
|
||||
* errors. Any of the errors in #GConvertError other than
|
||||
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
|
||||
*
|
||||
* Convert a string from UTF-16 to UCS-4. The result will be
|
||||
* terminated with a 0 character.
|
||||
*
|
||||
* Return value: a pointer to a newly allocated UCS-4 string.
|
||||
* This value must be freed with g_free(). If an
|
||||
* error occurs, %NULL will be returned and
|
||||
* @error set.
|
||||
**/
|
||||
gunichar *
|
||||
g_utf16_to_ucs4 (const gunichar2 *str,
|
||||
gint len,
|
||||
gint *items_read,
|
||||
gint *items_written,
|
||||
GError **error)
|
||||
{
|
||||
const gunichar2 *in;
|
||||
gchar *out;
|
||||
gchar *result = NULL;
|
||||
gint n_bytes;
|
||||
gunichar high_surrogate;
|
||||
|
||||
g_return_val_if_fail (str != 0, NULL);
|
||||
|
||||
n_bytes = 0;
|
||||
in = str;
|
||||
high_surrogate = 0;
|
||||
while ((len < 0 || in - str < len) && *in)
|
||||
{
|
||||
gunichar2 c = *in;
|
||||
gunichar wc;
|
||||
|
||||
if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
|
||||
{
|
||||
if (high_surrogate)
|
||||
{
|
||||
wc = SURROGATE_VALUE (high_surrogate, c);
|
||||
high_surrogate = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
||||
_("Invalid sequence in conversion input"));
|
||||
goto err_out;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (high_surrogate)
|
||||
{
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
||||
_("Invalid sequence in conversion input"));
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
|
||||
{
|
||||
high_surrogate = c;
|
||||
goto next1;
|
||||
}
|
||||
else
|
||||
wc = c;
|
||||
}
|
||||
|
||||
/********** DIFFERENT for UTF8/UCS4 **********/
|
||||
n_bytes += sizeof (gunichar);
|
||||
|
||||
next1:
|
||||
in++;
|
||||
}
|
||||
|
||||
if (high_surrogate && !items_read)
|
||||
{
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
|
||||
_("Partial character sequence at end of input"));
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
/* At this point, everything is valid, and we just need to convert
|
||||
*/
|
||||
/********** DIFFERENT for UTF8/UCS4 **********/
|
||||
result = g_malloc (n_bytes + 4);
|
||||
|
||||
high_surrogate = 0;
|
||||
out = result;
|
||||
in = str;
|
||||
while (out < result + n_bytes)
|
||||
{
|
||||
gunichar2 c = *in;
|
||||
gunichar wc;
|
||||
|
||||
if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
|
||||
{
|
||||
wc = SURROGATE_VALUE (high_surrogate, c);
|
||||
high_surrogate = 0;
|
||||
}
|
||||
else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
|
||||
{
|
||||
high_surrogate = c;
|
||||
goto next2;
|
||||
}
|
||||
else
|
||||
wc = c;
|
||||
|
||||
/********** DIFFERENT for UTF8/UCS4 **********/
|
||||
*(gunichar *)out = wc;
|
||||
out += sizeof (gunichar);
|
||||
|
||||
next2:
|
||||
in++;
|
||||
}
|
||||
|
||||
/********** DIFFERENT for UTF8/UCS4 **********/
|
||||
*(gunichar *)out = 0;
|
||||
|
||||
if (items_written)
|
||||
/********** DIFFERENT for UTF8/UCS4 **********/
|
||||
*items_written = (out - result) / sizeof (gunichar);
|
||||
|
||||
err_out:
|
||||
if (items_read)
|
||||
*items_read = in - str;
|
||||
|
||||
return (gunichar *)result;
|
||||
}
|
||||
|
||||
/**
|
||||
* g_utf8_to_utf16:
|
||||
* @str: a UTF-8 encoded string
|
||||
* @len: the maximum length of @str to use. If < 0, then
|
||||
* the string is %NULL terminated.
|
||||
|
||||
* @items_read: location to store number of bytes read, or %NULL.
|
||||
* If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
|
||||
* returned in case @str contains a trailing partial
|
||||
* character. If an error occurs then the index of the
|
||||
* invalid input is stored here.
|
||||
* @items_written: location to store number of words written, or %NULL.
|
||||
* The value stored here does not include the trailing
|
||||
* 0 word.
|
||||
* @error: location to store the error occurring, or %NULL to ignore
|
||||
* errors. Any of the errors in #GConvertError other than
|
||||
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
|
||||
*
|
||||
* Convert a string from UTF-8 to UTF-16. A 0 word will be
|
||||
* added to the result after the converted text.
|
||||
*
|
||||
* Return value: a pointer to a newly allocated UTF-16 string.
|
||||
* This value must be freed with g_free(). If an
|
||||
* error occurs, %NULL will be returned and
|
||||
* @error set.
|
||||
**/
|
||||
gunichar2 *
|
||||
g_utf8_to_utf16 (const gchar *str,
|
||||
gint len,
|
||||
gint *items_read,
|
||||
gint *items_written,
|
||||
GError **error)
|
||||
{
|
||||
gunichar2 *result = NULL;
|
||||
gint n16;
|
||||
const gchar *in;
|
||||
gint i;
|
||||
|
||||
g_return_val_if_fail (str != NULL, NULL);
|
||||
|
||||
in = str;
|
||||
n16 = 0;
|
||||
while ((len < 0 || str + len - in > 0) && *in)
|
||||
{
|
||||
gunichar wc = g_utf8_get_char_extended (in, str + len - in);
|
||||
if (wc & 0x80000000)
|
||||
{
|
||||
if (wc == (gunichar)-2)
|
||||
{
|
||||
if (items_read)
|
||||
break;
|
||||
else
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
|
||||
_("Partial character sequence at end of input"));
|
||||
}
|
||||
else
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
||||
_("Invalid byte sequence in conversion input"));
|
||||
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
if (wc < 0xd800)
|
||||
n16 += 1;
|
||||
else if (wc < 0xe000)
|
||||
{
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
||||
_("Invalid sequence in conversion input"));
|
||||
|
||||
goto err_out;
|
||||
}
|
||||
else if (wc < 0x10000)
|
||||
n16 += 1;
|
||||
else if (wc < 0x110000)
|
||||
n16 += 2;
|
||||
else
|
||||
{
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
||||
_("Character out of range for UTF-16"));
|
||||
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
in = g_utf8_next_char (in);
|
||||
}
|
||||
|
||||
result = g_new (gunichar2, n16 + 1);
|
||||
|
||||
in = str;
|
||||
for (i = 0; i < n16;)
|
||||
{
|
||||
gunichar wc = g_utf8_get_char (in);
|
||||
|
||||
if (wc < 0x10000)
|
||||
{
|
||||
result[i++] = wc;
|
||||
}
|
||||
else
|
||||
{
|
||||
result[i++] = (wc - 0x10000) / 0x400 + 0xd800;
|
||||
result[i++] = (wc - 0x10000) % 0x400 + 0xdc00;
|
||||
}
|
||||
|
||||
in = g_utf8_next_char (in);
|
||||
}
|
||||
|
||||
result[i] = 0;
|
||||
|
||||
if (items_written)
|
||||
*items_written = n16;
|
||||
|
||||
err_out:
|
||||
if (items_read)
|
||||
*items_read = in - str;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* g_ucs4_to_utf16:
|
||||
* @str: a UCS-4 encoded string
|
||||
* @len: the maximum length of @str to use. If < 0, then
|
||||
* the string is terminated with a zero character.
|
||||
* @items_read: location to store number of bytes read, or %NULL.
|
||||
* If an error occurs then the index of the invalid input
|
||||
* is stored here.
|
||||
* @items_written: location to store number of words written, or %NULL.
|
||||
* The value stored here does not include the trailing
|
||||
* 0 word.
|
||||
* @error: location to store the error occuring, or %NULL to ignore
|
||||
* errors. Any of the errors in #GConvertError other than
|
||||
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
|
||||
*
|
||||
* Convert a string from UCS-4 to UTF-16. A 0 word will be
|
||||
* added to the result after the converted text.
|
||||
*
|
||||
* Return value: a pointer to a newly allocated UTF-16 string.
|
||||
* This value must be freed with g_free(). If an
|
||||
* error occurs, %NULL will be returned and
|
||||
* @error set.
|
||||
**/
|
||||
gunichar2 *
|
||||
g_ucs4_to_utf16 (const gunichar *str,
|
||||
gint len,
|
||||
gint *items_read,
|
||||
gint *items_written,
|
||||
GError **error)
|
||||
{
|
||||
gunichar2 *result = NULL;
|
||||
gint n16;
|
||||
gint i, j;
|
||||
|
||||
n16 = 0;
|
||||
i = 0;
|
||||
while ((len < 0 || i < len) && str[i])
|
||||
{
|
||||
gunichar wc = str[i];
|
||||
|
||||
if (wc < 0xd800)
|
||||
n16 += 1;
|
||||
else if (wc < 0xe000)
|
||||
{
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
||||
_("Invalid sequence in conversion input"));
|
||||
|
||||
goto err_out;
|
||||
}
|
||||
else if (wc < 0x10000)
|
||||
n16 += 1;
|
||||
else if (wc < 0x110000)
|
||||
n16 += 2;
|
||||
else
|
||||
{
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
||||
_("Character out of range for UTF-16"));
|
||||
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
i++;
|
||||
}
|
||||
|
||||
result = g_new (gunichar2, n16 + 1);
|
||||
|
||||
for (i = 0, j = 0; j < n16; i++)
|
||||
{
|
||||
gunichar wc = str[i];
|
||||
|
||||
if (wc < 0x10000)
|
||||
{
|
||||
result[j++] = wc;
|
||||
}
|
||||
else
|
||||
{
|
||||
result[j++] = (wc - 0x10000) / 0x400 + 0xd800;
|
||||
result[j++] = (wc - 0x10000) % 0x400 + 0xdc00;
|
||||
}
|
||||
}
|
||||
result[j] = 0;
|
||||
|
||||
if (items_written)
|
||||
*items_written = n16;
|
||||
|
||||
err_out:
|
||||
if (items_read)
|
||||
*items_read = i;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -567,6 +1357,8 @@ g_utf8_validate (const gchar *str,
|
||||
{
|
||||
|
||||
const gchar *p;
|
||||
|
||||
g_return_val_if_fail (str != NULL, FALSE);
|
||||
|
||||
if (end)
|
||||
*end = str;
|
||||
@ -591,8 +1383,14 @@ g_utf8_validate (const gchar *str,
|
||||
|
||||
UTF8_GET (result, p, i, mask, len);
|
||||
|
||||
if (UTF8_LENGTH (result) != len) /* Check for overlong UTF-8 */
|
||||
break;
|
||||
|
||||
if (result == (gunichar)-1)
|
||||
break;
|
||||
|
||||
if (!UNICODE_VALID (result))
|
||||
break;
|
||||
|
||||
p += len;
|
||||
}
|
||||
|
45
gunicode.h
45
gunicode.h
@ -206,18 +206,39 @@ gchar *g_utf8_strchr (const gchar *p,
|
||||
gchar *g_utf8_strrchr (const gchar *p,
|
||||
gunichar c);
|
||||
|
||||
gunichar2 *g_utf8_to_utf16 (const gchar *str,
|
||||
gint len);
|
||||
gunichar * g_utf8_to_ucs4 (const gchar *str,
|
||||
gint len);
|
||||
gunichar * g_utf16_to_ucs4 (const gunichar2 *str,
|
||||
gint len);
|
||||
gchar * g_utf16_to_utf8 (const gunichar2 *str,
|
||||
gint len);
|
||||
gunichar * g_ucs4_to_utf16 (const gunichar *str,
|
||||
gint len);
|
||||
gchar * g_ucs4_to_utf8 (const gunichar *str,
|
||||
gint len);
|
||||
gunichar2 *g_utf8_to_utf16 (const gchar *str,
|
||||
gint len,
|
||||
gint *items_read,
|
||||
gint *items_written,
|
||||
GError **error);
|
||||
gunichar * g_utf8_to_ucs4 (const gchar *str,
|
||||
gint len,
|
||||
gint *items_read,
|
||||
gint *items_written,
|
||||
GError **error);
|
||||
gunichar * g_utf8_to_ucs4_fast (const gchar *str,
|
||||
gint len,
|
||||
gint *items_written);
|
||||
gunichar * g_utf16_to_ucs4 (const gunichar2 *str,
|
||||
gint len,
|
||||
gint *items_read,
|
||||
gint *items_written,
|
||||
GError **error);
|
||||
gchar * g_utf16_to_utf8 (const gunichar2 *str,
|
||||
gint len,
|
||||
gint *items_read,
|
||||
gint *items_written,
|
||||
GError **error);
|
||||
gunichar2 *g_ucs4_to_utf16 (const gunichar *str,
|
||||
gint len,
|
||||
gint *items_read,
|
||||
gint *items_written,
|
||||
GError **error);
|
||||
gchar * g_ucs4_to_utf8 (const gunichar *str,
|
||||
gint len,
|
||||
gint *items_read,
|
||||
gint *items_written,
|
||||
GError **error);
|
||||
|
||||
/* Convert a single character into UTF-8. outbuf must have at
|
||||
* least 6 bytes of space. Returns the number of bytes in the
|
||||
|
840
gutf8.c
840
gutf8.c
@ -33,6 +33,8 @@
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
#define _(s) (s)
|
||||
|
||||
#define UTF8_COMPUTE(Char, Mask, Len) \
|
||||
if (Char < 128) \
|
||||
{ \
|
||||
@ -67,6 +69,14 @@
|
||||
else \
|
||||
Len = -1;
|
||||
|
||||
/* Number of bytes needed to encode Char in (up to 6-byte) UTF-8.
 * Note that Char is evaluated several times.
 */
#define UTF8_LENGTH(Char)               \
  ((Char) < 0x80      ? 1 :             \
   (Char) < 0x800     ? 2 :             \
   (Char) < 0x10000   ? 3 :             \
   (Char) < 0x200000  ? 4 :             \
   (Char) < 0x4000000 ? 5 : 6)
|
||||
|
||||
|
||||
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
|
||||
(Result) = (Chars)[0] & (Mask); \
|
||||
for ((Count) = 1; (Count) < (Len); ++(Count)) \
|
||||
@ -79,6 +89,13 @@
|
||||
(Result) <<= 6; \
|
||||
(Result) |= ((Chars)[(Count)] & 0x3f); \
|
||||
}
|
||||
|
||||
/* True when Char is below 0x110000, is not a surrogate
 * (0xD800..0xDFFF) and is neither 0xFFFE nor 0xFFFF.
 * Note that Char is evaluated several times.
 */
#define UNICODE_VALID(Char)                             \
  (!((Char) >= 0x110000 ||                              \
     ((Char) >= 0xD800 && (Char) < 0xE000) ||           \
     (Char) == 0xFFFE || (Char) == 0xFFFF))
|
||||
|
||||
|
||||
gchar g_utf8_skip[256] = {
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
@ -473,33 +490,272 @@ unicode_strrchr (const char *p, gunichar c)
|
||||
#endif
|
||||
|
||||
|
||||
/* Like g_utf8_get_char, but take a maximum length
|
||||
* and return (gunichar)-2 on incomplete trailing character
|
||||
*/
|
||||
static inline gunichar
|
||||
g_utf8_get_char_extended (const gchar *p, int max_len)
|
||||
{
|
||||
gint i, len;
|
||||
gunichar wc = (guchar) *p;
|
||||
|
||||
if (wc < 0x80)
|
||||
{
|
||||
return wc;
|
||||
}
|
||||
else if (wc < 0xc0)
|
||||
{
|
||||
return (gunichar)-1;
|
||||
}
|
||||
else if (wc < 0xe0)
|
||||
{
|
||||
len = 2;
|
||||
wc &= 0x1f;
|
||||
}
|
||||
else if (wc < 0xf0)
|
||||
{
|
||||
len = 3;
|
||||
wc &= 0x0f;
|
||||
}
|
||||
else if (wc < 0xf8)
|
||||
{
|
||||
len = 4;
|
||||
wc &= 0x07;
|
||||
}
|
||||
else if (wc < 0xfc)
|
||||
{
|
||||
len = 5;
|
||||
wc &= 0x03;
|
||||
}
|
||||
else if (wc < 0xfe)
|
||||
{
|
||||
len = 6;
|
||||
wc &= 0x01;
|
||||
}
|
||||
else
|
||||
{
|
||||
return (gunichar)-1;
|
||||
}
|
||||
|
||||
if (len == -1)
|
||||
return (gunichar)-1;
|
||||
if (max_len >= 0 && len > max_len)
|
||||
{
|
||||
for (i = 1; i < max_len; i++)
|
||||
{
|
||||
if ((((guchar *)p)[i] & 0xc0) != 0x80)
|
||||
return (gunichar)-1;
|
||||
}
|
||||
return (gunichar)-2;
|
||||
}
|
||||
|
||||
for (i = 1; i < len; ++i)
|
||||
{
|
||||
gunichar ch = ((guchar *)p)[i];
|
||||
|
||||
if ((ch & 0xc0) != 0x80)
|
||||
{
|
||||
if (ch)
|
||||
return (gunichar)-1;
|
||||
else
|
||||
return (gunichar)-2;
|
||||
}
|
||||
|
||||
wc <<= 6;
|
||||
wc |= (ch & 0x3f);
|
||||
}
|
||||
|
||||
if (UTF8_LENGTH(wc) != len)
|
||||
return (gunichar)-1;
|
||||
|
||||
return wc;
|
||||
}
|
||||
|
||||
/**
|
||||
* g_utf8_to_ucs4:
|
||||
* @str: a UTF-8 encoded strnig
|
||||
* @len: the length of @
|
||||
*
|
||||
* g_utf8_to_ucs4_fast:
|
||||
* @str: a UTF-8 encoded string
|
||||
* @len: the maximum length of @str to use. If < 0, then
|
||||
* the string is %NULL terminated.
|
||||
* @items_written: location to store the number of characters in the
|
||||
* result, or %NULL.
|
||||
*
|
||||
* Convert a string from UTF-8 to a 32-bit fixed width
|
||||
* representation as UCS-4.
|
||||
* representation as UCS-4, assuming valid UTF-8 input.
|
||||
* This function is roughly twice as fast as g_utf8_to_ucs4()
|
||||
* but does no error checking on the input.
|
||||
*
|
||||
* Return value: a pointer to a newly allocated UCS-4 string.
|
||||
* This value must be freed with g_free()
|
||||
**/
|
||||
gunichar *
|
||||
g_utf8_to_ucs4 (const char *str, int len)
|
||||
g_utf8_to_ucs4_fast (const gchar *str,
|
||||
gint len,
|
||||
gint *items_written)
|
||||
{
|
||||
gint j, charlen;
|
||||
gunichar *result;
|
||||
gint n_chars, i;
|
||||
const gchar *p;
|
||||
|
||||
g_return_val_if_fail (str != NULL, NULL);
|
||||
|
||||
p = str;
|
||||
n_chars = 0;
|
||||
if (len < 0)
|
||||
{
|
||||
while (*p)
|
||||
{
|
||||
p = g_utf8_next_char (p);
|
||||
++n_chars;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
while (*p && p < str + len)
|
||||
{
|
||||
p = g_utf8_next_char (p);
|
||||
++n_chars;
|
||||
}
|
||||
}
|
||||
|
||||
n_chars = g_utf8_strlen (str, len);
|
||||
result = g_new (gunichar, n_chars);
|
||||
result = g_new (gunichar, n_chars + 1);
|
||||
|
||||
p = str;
|
||||
for (i=0; i < n_chars; i++)
|
||||
{
|
||||
result[i] = g_utf8_get_char (p);
|
||||
p = g_utf8_next_char (p);
|
||||
gunichar wc = ((unsigned char *)p)[0];
|
||||
|
||||
if (wc < 0x80)
|
||||
{
|
||||
result[i] = wc;
|
||||
p++;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (wc < 0xe0)
|
||||
{
|
||||
charlen = 2;
|
||||
wc &= 0x1f;
|
||||
}
|
||||
else if (wc < 0xf0)
|
||||
{
|
||||
charlen = 3;
|
||||
wc &= 0x0f;
|
||||
}
|
||||
else if (wc < 0xf8)
|
||||
{
|
||||
charlen = 4;
|
||||
wc &= 0x07;
|
||||
}
|
||||
else if (wc < 0xfc)
|
||||
{
|
||||
charlen = 5;
|
||||
wc &= 0x03;
|
||||
}
|
||||
else
|
||||
{
|
||||
charlen = 6;
|
||||
wc &= 0x01;
|
||||
}
|
||||
|
||||
for (j = 1; j < charlen; j++)
|
||||
{
|
||||
wc <<= 6;
|
||||
wc |= ((unsigned char *)p)[j] & 0x3f;
|
||||
}
|
||||
|
||||
result[i] = wc;
|
||||
p += charlen;
|
||||
}
|
||||
}
|
||||
result[i] = 0;
|
||||
|
||||
if (items_written)
|
||||
*items_written = i;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* g_utf8_to_ucs4:
|
||||
* @str: a UTF-8 encoded string
|
||||
* @len: the maximum length of @str to use. If < 0, then
|
||||
* the string is %NULL terminated.
|
||||
* @items_read: location to store number of bytes read, or %NULL.
|
||||
* If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
|
||||
* returned in case @str contains a trailing partial
|
||||
* character. If an error occurs then the index of the
|
||||
* invalid input is stored here.
|
||||
* @items_written: location to store number of characters written or %NULL.
|
||||
* The value here stored does not include the trailing 0
|
||||
* character.
|
||||
* @error: location to store the error occuring, or %NULL to ignore
|
||||
* errors. Any of the errors in #GConvertError other than
|
||||
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
|
||||
*
|
||||
* Convert a string from UTF-8 to a 32-bit fixed width
|
||||
* representation as UCS-4. A trailing 0 will be added to the
|
||||
* string after the converted text.
|
||||
*
|
||||
* Return value: a pointer to a newly allocated UCS-4 string.
|
||||
* This value must be freed with g_free(). If an
|
||||
* error occurs, %NULL will be returned and
|
||||
* @error set.
|
||||
**/
|
||||
gunichar *
|
||||
g_utf8_to_ucs4 (const gchar *str,
|
||||
gint len,
|
||||
gint *items_read,
|
||||
gint *items_written,
|
||||
GError **error)
|
||||
{
|
||||
gunichar *result = NULL;
|
||||
gint n_chars, i;
|
||||
const gchar *in;
|
||||
|
||||
in = str;
|
||||
n_chars = 0;
|
||||
while ((len < 0 || str + len - in > 0) && *in)
|
||||
{
|
||||
gunichar wc = g_utf8_get_char_extended (in, str + len - in);
|
||||
if (wc & 0x80000000)
|
||||
{
|
||||
if (wc == (gunichar)-2)
|
||||
{
|
||||
if (items_read)
|
||||
break;
|
||||
else
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
|
||||
_("Partial character sequence at end of input"));
|
||||
}
|
||||
else
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
||||
_("Invalid byte sequence in conversion input"));
|
||||
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
n_chars++;
|
||||
|
||||
in = g_utf8_next_char (in);
|
||||
}
|
||||
|
||||
result = g_new (gunichar, n_chars + 1);
|
||||
|
||||
in = str;
|
||||
for (i=0; i < n_chars; i++)
|
||||
{
|
||||
result[i] = g_utf8_get_char (in);
|
||||
in = g_utf8_next_char (in);
|
||||
}
|
||||
result[i] = 0;
|
||||
|
||||
if (items_written)
|
||||
*items_written = n_chars;
|
||||
|
||||
err_out:
|
||||
if (items_read)
|
||||
*items_read = in - str;
|
||||
|
||||
return result;
|
||||
}
|
||||
@ -507,35 +763,569 @@ g_utf8_to_ucs4 (const char *str, int len)
|
||||
/**
|
||||
* g_ucs4_to_utf8:
|
||||
* @str: a UCS-4 encoded string
|
||||
* @len: the length of @
|
||||
*
|
||||
* @len: the maximum length of @str to use. If < 0, then
|
||||
* the string is %NULL terminated.
|
||||
* @items_read: location to store number of characters read read, or %NULL.
|
||||
* @items_written: location to store number of bytes written or %NULL.
|
||||
* The value here stored does not include the trailing 0
|
||||
* byte.
|
||||
* @error: location to store the error occuring, or %NULL to ignore
|
||||
* errors. Any of the errors in #GConvertError other than
|
||||
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
|
||||
*
|
||||
* Convert a string from a 32-bit fixed width representation as UCS-4.
|
||||
* to UTF-8.
|
||||
* to UTF-8. The result will be terminated with a 0 byte.
|
||||
*
|
||||
* Return value: a pointer to a newly allocated UTF-8 string.
|
||||
* This value must be freed with g_free()
|
||||
* This value must be freed with g_free(). If an
|
||||
* error occurs, %NULL will be returned and
|
||||
* @error set.
|
||||
**/
|
||||
gchar *
|
||||
g_ucs4_to_utf8 (const gunichar *str, int len)
|
||||
g_ucs4_to_utf8 (const gunichar *str,
|
||||
gint len,
|
||||
gint *items_read,
|
||||
gint *items_written,
|
||||
GError **error)
|
||||
{
|
||||
gint result_length;
|
||||
gchar *result, *p;
|
||||
gchar *result = NULL;
|
||||
gchar *p;
|
||||
gint i;
|
||||
|
||||
result_length = 0;
|
||||
for (i = 0; i < len ; i++)
|
||||
result_length += g_unichar_to_utf8 (str[i], NULL);
|
||||
for (i = 0; len < 0 || i < len ; i++)
|
||||
{
|
||||
if (!str[i])
|
||||
break;
|
||||
|
||||
result_length++;
|
||||
if (str[i] >= 0x80000000)
|
||||
{
|
||||
if (items_read)
|
||||
*items_read = i;
|
||||
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
||||
_("Character out of range for UTF-8"));
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
result_length += UTF8_LENGTH (str[i]);
|
||||
}
|
||||
|
||||
result = g_malloc (result_length + 1);
|
||||
p = result;
|
||||
|
||||
for (i = 0; i < len ; i++)
|
||||
p += g_unichar_to_utf8 (str[i], p);
|
||||
i = 0;
|
||||
while (p < result + result_length)
|
||||
p += g_unichar_to_utf8 (str[i++], p);
|
||||
|
||||
*p = '\0';
|
||||
|
||||
if (items_written)
|
||||
*items_written = p - result;
|
||||
|
||||
err_out:
|
||||
if (items_read)
|
||||
*items_read = i;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Combine a UTF-16 high surrogate h and low surrogate l into the character they encode */
#define SURROGATE_VALUE(h,l) (0x10000 + ((h) - 0xd800) * 0x400 + ((l) - 0xdc00))
|
||||
|
||||
/**
|
||||
* g_utf16_to_utf8:
|
||||
* @str: a UTF-16 encoded string
|
||||
* @len: the maximum length of @str to use. If < 0, then
|
||||
* the string is terminated with a 0 character.
|
||||
* @items_read: location to store number of words read, or %NULL.
|
||||
* If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
|
||||
* returned in case @str contains a trailing partial
|
||||
* character. If an error occurs then the index of the
|
||||
* invalid input is stored here.
|
||||
* @items_written: location to store number of bytes written, or %NULL.
|
||||
* The value stored here does not include the trailing
|
||||
* 0 byte.
|
||||
* @error: location to store the error occuring, or %NULL to ignore
|
||||
* errors. Any of the errors in #GConvertError other than
|
||||
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
|
||||
*
|
||||
* Convert a string from UTF-16 to UTF-8. The result will be
|
||||
* terminated with a 0 byte.
|
||||
*
|
||||
* Return value: a pointer to a newly allocated UTF-8 string.
|
||||
* This value must be freed with g_free(). If an
|
||||
* error occurs, %NULL will be returned and
|
||||
* @error set.
|
||||
**/
|
||||
gchar *
|
||||
g_utf16_to_utf8 (const gunichar2 *str,
|
||||
gint len,
|
||||
gint *items_read,
|
||||
gint *items_written,
|
||||
GError **error)
|
||||
{
|
||||
/* This function and g_utf16_to_ucs4 are almost exactly identical - The lines that differ
|
||||
* are marked.
|
||||
*/
|
||||
const gunichar2 *in;
|
||||
gchar *out;
|
||||
gchar *result = NULL;
|
||||
gint n_bytes;
|
||||
gunichar high_surrogate;
|
||||
|
||||
g_return_val_if_fail (str != 0, NULL);
|
||||
|
||||
n_bytes = 0;
|
||||
in = str;
|
||||
high_surrogate = 0;
|
||||
while ((len < 0 || in - str < len) && *in)
|
||||
{
|
||||
gunichar2 c = *in;
|
||||
gunichar wc;
|
||||
|
||||
if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
|
||||
{
|
||||
if (high_surrogate)
|
||||
{
|
||||
wc = SURROGATE_VALUE (high_surrogate, c);
|
||||
high_surrogate = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
||||
_("Invalid sequence in conversion input"));
|
||||
goto err_out;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (high_surrogate)
|
||||
{
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
||||
_("Invalid sequence in conversion input"));
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
|
||||
{
|
||||
high_surrogate = c;
|
||||
goto next1;
|
||||
}
|
||||
else
|
||||
wc = c;
|
||||
}
|
||||
|
||||
/********** DIFFERENT for UTF8/UCS4 **********/
|
||||
n_bytes += UTF8_LENGTH (wc);
|
||||
|
||||
next1:
|
||||
in++;
|
||||
}
|
||||
|
||||
if (high_surrogate && !items_read)
|
||||
{
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
|
||||
_("Partial character sequence at end of input"));
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
/* At this point, everything is valid, and we just need to convert
|
||||
*/
|
||||
/********** DIFFERENT for UTF8/UCS4 **********/
|
||||
result = g_malloc (n_bytes + 1);
|
||||
|
||||
high_surrogate = 0;
|
||||
out = result;
|
||||
in = str;
|
||||
while (out < result + n_bytes)
|
||||
{
|
||||
gunichar2 c = *in;
|
||||
gunichar wc;
|
||||
|
||||
if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
|
||||
{
|
||||
wc = SURROGATE_VALUE (high_surrogate, c);
|
||||
high_surrogate = 0;
|
||||
}
|
||||
else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
|
||||
{
|
||||
high_surrogate = c;
|
||||
goto next2;
|
||||
}
|
||||
else
|
||||
wc = c;
|
||||
|
||||
/********** DIFFERENT for UTF8/UCS4 **********/
|
||||
out += g_unichar_to_utf8 (wc, out);
|
||||
|
||||
next2:
|
||||
in++;
|
||||
}
|
||||
|
||||
/********** DIFFERENT for UTF8/UCS4 **********/
|
||||
*out = '\0';
|
||||
|
||||
if (items_written)
|
||||
/********** DIFFERENT for UTF8/UCS4 **********/
|
||||
*items_written = out - result;
|
||||
|
||||
err_out:
|
||||
if (items_read)
|
||||
*items_read = in - str;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* g_utf16_to_ucs4:
|
||||
* @str: a UTF-16 encoded string
|
||||
* @len: the maximum length of @str to use. If < 0, then
|
||||
* the string is terminated with a 0 character.
|
||||
* @items_read: location to store number of words read, or %NULL.
|
||||
* If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
|
||||
* returned in case @str contains a trailing partial
|
||||
* character. If an error occurs then the index of the
|
||||
* invalid input is stored here.
|
||||
* @items_written: location to store number of characters written, or %NULL.
|
||||
* The value stored here does not include the trailing
|
||||
* 0 character.
|
||||
* @error: location to store the error occuring, or %NULL to ignore
|
||||
* errors. Any of the errors in #GConvertError other than
|
||||
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
|
||||
*
|
||||
* Convert a string from UTF-16 to UCS-4. The result will be
|
||||
* terminated with a 0 character.
|
||||
*
|
||||
* Return value: a pointer to a newly allocated UCS-4 string.
|
||||
* This value must be freed with g_free(). If an
|
||||
* error occurs, %NULL will be returned and
|
||||
* @error set.
|
||||
**/
|
||||
gunichar *
|
||||
g_utf16_to_ucs4 (const gunichar2 *str,
|
||||
gint len,
|
||||
gint *items_read,
|
||||
gint *items_written,
|
||||
GError **error)
|
||||
{
|
||||
const gunichar2 *in;
|
||||
gchar *out;
|
||||
gchar *result = NULL;
|
||||
gint n_bytes;
|
||||
gunichar high_surrogate;
|
||||
|
||||
g_return_val_if_fail (str != 0, NULL);
|
||||
|
||||
n_bytes = 0;
|
||||
in = str;
|
||||
high_surrogate = 0;
|
||||
while ((len < 0 || in - str < len) && *in)
|
||||
{
|
||||
gunichar2 c = *in;
|
||||
gunichar wc;
|
||||
|
||||
if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
|
||||
{
|
||||
if (high_surrogate)
|
||||
{
|
||||
wc = SURROGATE_VALUE (high_surrogate, c);
|
||||
high_surrogate = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
||||
_("Invalid sequence in conversion input"));
|
||||
goto err_out;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (high_surrogate)
|
||||
{
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
||||
_("Invalid sequence in conversion input"));
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
|
||||
{
|
||||
high_surrogate = c;
|
||||
goto next1;
|
||||
}
|
||||
else
|
||||
wc = c;
|
||||
}
|
||||
|
||||
/********** DIFFERENT for UTF8/UCS4 **********/
|
||||
n_bytes += sizeof (gunichar);
|
||||
|
||||
next1:
|
||||
in++;
|
||||
}
|
||||
|
||||
if (high_surrogate && !items_read)
|
||||
{
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
|
||||
_("Partial character sequence at end of input"));
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
/* At this point, everything is valid, and we just need to convert
|
||||
*/
|
||||
/********** DIFFERENT for UTF8/UCS4 **********/
|
||||
result = g_malloc (n_bytes + 4);
|
||||
|
||||
high_surrogate = 0;
|
||||
out = result;
|
||||
in = str;
|
||||
while (out < result + n_bytes)
|
||||
{
|
||||
gunichar2 c = *in;
|
||||
gunichar wc;
|
||||
|
||||
if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
|
||||
{
|
||||
wc = SURROGATE_VALUE (high_surrogate, c);
|
||||
high_surrogate = 0;
|
||||
}
|
||||
else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
|
||||
{
|
||||
high_surrogate = c;
|
||||
goto next2;
|
||||
}
|
||||
else
|
||||
wc = c;
|
||||
|
||||
/********** DIFFERENT for UTF8/UCS4 **********/
|
||||
*(gunichar *)out = wc;
|
||||
out += sizeof (gunichar);
|
||||
|
||||
next2:
|
||||
in++;
|
||||
}
|
||||
|
||||
/********** DIFFERENT for UTF8/UCS4 **********/
|
||||
*(gunichar *)out = 0;
|
||||
|
||||
if (items_written)
|
||||
/********** DIFFERENT for UTF8/UCS4 **********/
|
||||
*items_written = (out - result) / sizeof (gunichar);
|
||||
|
||||
err_out:
|
||||
if (items_read)
|
||||
*items_read = in - str;
|
||||
|
||||
return (gunichar *)result;
|
||||
}
|
||||
|
||||
/**
|
||||
* g_utf8_to_utf16:
|
||||
* @str: a UTF-8 encoded string
|
||||
* @len: the maximum length of @str to use. If < 0, then
|
||||
* the string is %NULL terminated.
|
||||
|
||||
* @items_read: location to store number of bytes read, or %NULL.
|
||||
* If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
|
||||
* returned in case @str contains a trailing partial
|
||||
* character. If an error occurs then the index of the
|
||||
* invalid input is stored here.
|
||||
* @items_written: location to store number of words written, or %NULL.
|
||||
* The value stored here does not include the trailing
|
||||
* 0 word.
|
||||
* @error: location to store the error occuring, or %NULL to ignore
|
||||
* errors. Any of the errors in #GConvertError other than
|
||||
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
|
||||
*
|
||||
* Convert a string from UTF-8 to UTF-16. A 0 word will be
|
||||
* added to the result after the converted text.
|
||||
*
|
||||
* Return value: a pointer to a newly allocated UTF-16 string.
|
||||
* This value must be freed with g_free(). If an
|
||||
* error occurs, %NULL will be returned and
|
||||
* @error set.
|
||||
**/
|
||||
gunichar2 *
|
||||
g_utf8_to_utf16 (const gchar *str,
|
||||
gint len,
|
||||
gint *items_read,
|
||||
gint *items_written,
|
||||
GError **error)
|
||||
{
|
||||
gunichar2 *result = NULL;
|
||||
gint n16;
|
||||
const gchar *in;
|
||||
gint i;
|
||||
|
||||
g_return_val_if_fail (str != NULL, NULL);
|
||||
|
||||
in = str;
|
||||
n16 = 0;
|
||||
while ((len < 0 || str + len - in > 0) && *in)
|
||||
{
|
||||
gunichar wc = g_utf8_get_char_extended (in, str + len - in);
|
||||
if (wc & 0x80000000)
|
||||
{
|
||||
if (wc == (gunichar)-2)
|
||||
{
|
||||
if (items_read)
|
||||
break;
|
||||
else
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
|
||||
_("Partial character sequence at end of input"));
|
||||
}
|
||||
else
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
||||
_("Invalid byte sequence in conversion input"));
|
||||
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
if (wc < 0xd800)
|
||||
n16 += 1;
|
||||
else if (wc < 0xe000)
|
||||
{
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
||||
_("Invalid sequence in conversion input"));
|
||||
|
||||
goto err_out;
|
||||
}
|
||||
else if (wc < 0x10000)
|
||||
n16 += 1;
|
||||
else if (wc < 0x110000)
|
||||
n16 += 2;
|
||||
else
|
||||
{
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
||||
_("Character out of range for UTF-16"));
|
||||
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
in = g_utf8_next_char (in);
|
||||
}
|
||||
|
||||
result = g_new (gunichar2, n16 + 1);
|
||||
|
||||
in = str;
|
||||
for (i = 0; i < n16;)
|
||||
{
|
||||
gunichar wc = g_utf8_get_char (in);
|
||||
|
||||
if (wc < 0x10000)
|
||||
{
|
||||
result[i++] = wc;
|
||||
}
|
||||
else
|
||||
{
|
||||
result[i++] = (wc - 0x10000) / 0x400 + 0xd800;
|
||||
result[i++] = (wc - 0x10000) % 0x400 + 0xdc00;
|
||||
}
|
||||
|
||||
in = g_utf8_next_char (in);
|
||||
}
|
||||
|
||||
result[i] = 0;
|
||||
|
||||
if (items_written)
|
||||
*items_written = n16;
|
||||
|
||||
err_out:
|
||||
if (items_read)
|
||||
*items_read = in - str;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* g_ucs4_to_utf16:
|
||||
* @str: a UCS-4 encoded string
|
||||
* @len: the maximum length of @str to use. If < 0, then
|
||||
* the string is terminated with a zero character.
|
||||
* @items_read: location to store number of bytes read, or %NULL.
|
||||
* If an error occurs then the index of the invalid input
|
||||
* is stored here.
|
||||
* @items_written: location to store number of words written, or %NULL.
|
||||
* The value stored here does not include the trailing
|
||||
* 0 word.
|
||||
* @error: location to store the error occuring, or %NULL to ignore
|
||||
* errors. Any of the errors in #GConvertError other than
|
||||
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
|
||||
*
|
||||
* Convert a string from UCS-4 to UTF-16. A 0 word will be
|
||||
* added to the result after the converted text.
|
||||
*
|
||||
* Return value: a pointer to a newly allocated UTF-16 string.
|
||||
* This value must be freed with g_free(). If an
|
||||
* error occurs, %NULL will be returned and
|
||||
* @error set.
|
||||
**/
|
||||
gunichar2 *
|
||||
g_ucs4_to_utf16 (const gunichar *str,
|
||||
gint len,
|
||||
gint *items_read,
|
||||
gint *items_written,
|
||||
GError **error)
|
||||
{
|
||||
gunichar2 *result = NULL;
|
||||
gint n16;
|
||||
gint i, j;
|
||||
|
||||
n16 = 0;
|
||||
i = 0;
|
||||
while ((len < 0 || i < len) && str[i])
|
||||
{
|
||||
gunichar wc = str[i];
|
||||
|
||||
if (wc < 0xd800)
|
||||
n16 += 1;
|
||||
else if (wc < 0xe000)
|
||||
{
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
||||
_("Invalid sequence in conversion input"));
|
||||
|
||||
goto err_out;
|
||||
}
|
||||
else if (wc < 0x10000)
|
||||
n16 += 1;
|
||||
else if (wc < 0x110000)
|
||||
n16 += 2;
|
||||
else
|
||||
{
|
||||
g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
||||
_("Character out of range for UTF-16"));
|
||||
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
i++;
|
||||
}
|
||||
|
||||
result = g_new (gunichar2, n16 + 1);
|
||||
|
||||
for (i = 0, j = 0; j < n16; i++)
|
||||
{
|
||||
gunichar wc = str[i];
|
||||
|
||||
if (wc < 0x10000)
|
||||
{
|
||||
result[j++] = wc;
|
||||
}
|
||||
else
|
||||
{
|
||||
result[j++] = (wc - 0x10000) / 0x400 + 0xd800;
|
||||
result[j++] = (wc - 0x10000) % 0x400 + 0xdc00;
|
||||
}
|
||||
}
|
||||
result[j] = 0;
|
||||
|
||||
if (items_written)
|
||||
*items_written = n16;
|
||||
|
||||
err_out:
|
||||
if (items_read)
|
||||
*items_read = i;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -567,6 +1357,8 @@ g_utf8_validate (const gchar *str,
|
||||
{
|
||||
|
||||
const gchar *p;
|
||||
|
||||
g_return_val_if_fail (str != NULL, FALSE);
|
||||
|
||||
if (end)
|
||||
*end = str;
|
||||
@ -591,8 +1383,14 @@ g_utf8_validate (const gchar *str,
|
||||
|
||||
UTF8_GET (result, p, i, mask, len);
|
||||
|
||||
if (UTF8_LENGTH (result) != len) /* Check for overlong UTF-8 */
|
||||
break;
|
||||
|
||||
if (result == (gunichar)-1)
|
||||
break;
|
||||
|
||||
if (!UNICODE_VALID (result))
|
||||
break;
|
||||
|
||||
p += len;
|
||||
}
|
||||
|
@ -33,7 +33,8 @@ test_programs = \
|
||||
thread-test \
|
||||
threadpool-test \
|
||||
tree-test \
|
||||
type-test
|
||||
type-test \
|
||||
unicode-encoding
|
||||
|
||||
test_scripts = run-markup-tests.sh
|
||||
|
||||
@ -71,6 +72,7 @@ thread_test_LDADD = $(thread_LDADD)
|
||||
threadpool_test_LDADD = $(thread_LDADD)
|
||||
tree_test_LDADD = $(progs_LDADD)
|
||||
type_test_LDADD = $(progs_LDADD)
|
||||
unicode_encoding_LDADD = $(progs_LDADD)
|
||||
|
||||
lib_LTLIBRARIES = libmoduletestplugin_a.la libmoduletestplugin_b.la
|
||||
|
||||
|
@ -155,7 +155,7 @@ adder_thread (gpointer data)
|
||||
|
||||
g_free (channels);
|
||||
|
||||
g_main_loop_destroy (addr_data.loop);
|
||||
g_main_loop_unref (addr_data.loop);
|
||||
|
||||
g_print ("Timeout run %d times\n", addr_data.count);
|
||||
|
||||
@ -393,7 +393,7 @@ main (int argc,
|
||||
g_timeout_add (RECURSER_TIMEOUT, recurser_start, NULL);
|
||||
|
||||
g_main_loop_run (main_loop);
|
||||
g_main_loop_destroy (main_loop);
|
||||
g_main_loop_unref (main_loop);
|
||||
|
||||
#endif
|
||||
return 0;
|
||||
|
411
tests/unicode-encoding.c
Normal file
411
tests/unicode-encoding.c
Normal file
@ -0,0 +1,411 @@
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <glib.h>
|
||||
|
||||
/* Failure flag for the whole run: fail() ORs 1 into it each time a check fails. */
static gint exit_status = 0;
|
||||
|
||||
/*
 * croak:
 * @format: printf()-style format string, followed by its arguments.
 *
 * Reports a fatal problem on stderr and terminates the program
 * immediately with exit code 1.  Never returns.
 */
void
croak (char *format, ...)
{
  va_list args;

  va_start (args, format);
  vfprintf (stderr, format, args);
  va_end (args);

  exit (1);
}
|
||||
|
||||
void
|
||||
fail (char *format, ...)
|
||||
{
|
||||
va_list va;
|
||||
|
||||
va_start (va, format);
|
||||
vfprintf (stderr, format, va);
|
||||
va_end (va);
|
||||
|
||||
exit_status |= 1;
|
||||
}
|
||||
|
||||
/* Expected classification of a test string, as spelled in utf8.txt. */
typedef enum
{
  VALID      = 0,  /* valid UTF-8 representing valid Unicode */
  INCOMPLETE = 1,  /* ends in a partial character */
  NOTUNICODE = 2,  /* valid UTF-8, but the characters are not valid Unicode */
  OVERLONG   = 3,  /* contains overlong sequences */
  MALFORMED  = 4   /* not valid UTF-8 */
} Status;
|
||||
|
||||
static gboolean
|
||||
ucs4_equal (gunichar *a, gunichar *b)
|
||||
{
|
||||
while (*a && *b && (*a == *b))
|
||||
{
|
||||
a++;
|
||||
b++;
|
||||
}
|
||||
|
||||
return (*a == *b);
|
||||
}
|
||||
|
||||
static gboolean
|
||||
utf16_equal (gunichar2 *a, gunichar2 *b)
|
||||
{
|
||||
while (*a && *b && (*a == *b))
|
||||
{
|
||||
a++;
|
||||
b++;
|
||||
}
|
||||
|
||||
return (*a == *b);
|
||||
}
|
||||
|
||||
static gint
|
||||
utf16_count (gunichar2 *a)
|
||||
{
|
||||
gint result = 0;
|
||||
|
||||
while (a[result])
|
||||
result++;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static void
|
||||
process (gint line,
|
||||
gchar *utf8,
|
||||
Status status,
|
||||
gunichar *ucs4,
|
||||
gint ucs4_len)
|
||||
{
|
||||
const gchar *end;
|
||||
gboolean is_valid = g_utf8_validate (utf8, -1, &end);
|
||||
GError *error = NULL;
|
||||
gint items_read, items_written;
|
||||
|
||||
switch (status)
|
||||
{
|
||||
case VALID:
|
||||
if (!is_valid)
|
||||
{
|
||||
fail ("line %d: valid but g_utf8_validate returned FALSE\n", line);
|
||||
return;
|
||||
}
|
||||
break;
|
||||
case NOTUNICODE:
|
||||
case INCOMPLETE:
|
||||
case OVERLONG:
|
||||
case MALFORMED:
|
||||
if (is_valid)
|
||||
{
|
||||
fail ("line %d: invalid but g_utf8_validate returned TRUE\n", line);
|
||||
return;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if (status == INCOMPLETE)
|
||||
{
|
||||
gunichar *ucs4_result;
|
||||
|
||||
ucs4_result = g_utf8_to_ucs4 (utf8, -1, NULL, NULL, &error);
|
||||
|
||||
if (!error || !g_error_matches (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT))
|
||||
{
|
||||
fail ("line %d: incomplete input not properly detected\n", line);
|
||||
return;
|
||||
}
|
||||
g_clear_error (&error);
|
||||
|
||||
ucs4_result = g_utf8_to_ucs4 (utf8, -1, &items_read, NULL, &error);
|
||||
|
||||
if (!ucs4_result || items_read == strlen (utf8))
|
||||
{
|
||||
fail ("line %d: incomplete input not properly detected\n", line);
|
||||
return;
|
||||
}
|
||||
|
||||
g_free (ucs4_result);
|
||||
}
|
||||
|
||||
if (status == VALID || status == NOTUNICODE)
|
||||
{
|
||||
gunichar *ucs4_result;
|
||||
gchar *utf8_result;
|
||||
|
||||
ucs4_result = g_utf8_to_ucs4 (utf8, -1, &items_read, &items_written, &error);
|
||||
if (!ucs4_result)
|
||||
{
|
||||
fail ("line %d: conversion to ucs4 failed: %s\n", line, error->message);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!ucs4_equal (ucs4_result, ucs4) ||
|
||||
items_read != strlen (utf8) ||
|
||||
items_written != ucs4_len)
|
||||
{
|
||||
fail ("line %d: results of conversion to ucs4 do not match expected.\n", line);
|
||||
return;
|
||||
}
|
||||
|
||||
g_free (ucs4_result);
|
||||
|
||||
ucs4_result = g_utf8_to_ucs4_fast (utf8, -1, &items_written);
|
||||
|
||||
if (!ucs4_equal (ucs4_result, ucs4) ||
|
||||
items_written != ucs4_len)
|
||||
{
|
||||
fail ("line %d: results of conversion to ucs4 do not match expected.\n", line);
|
||||
return;
|
||||
}
|
||||
|
||||
utf8_result = g_ucs4_to_utf8 (ucs4_result, -1, &items_read, &items_written, &error);
|
||||
if (!utf8_result)
|
||||
{
|
||||
fail ("line %d: conversion back to utf8 failed: %s", line, error->message);
|
||||
return;
|
||||
}
|
||||
|
||||
if (strcmp (utf8_result, utf8) != 0 ||
|
||||
items_read != ucs4_len ||
|
||||
items_written != strlen (utf8))
|
||||
{
|
||||
fail ("line %d: conversion back to utf8 did not match original\n", line);
|
||||
return;
|
||||
}
|
||||
|
||||
g_free (utf8_result);
|
||||
g_free (ucs4_result);
|
||||
}
|
||||
|
||||
if (status == VALID)
|
||||
{
|
||||
gunichar2 *utf16_expected_tmp;
|
||||
gunichar2 *utf16_expected;
|
||||
gunichar2 *utf16_from_utf8;
|
||||
gunichar2 *utf16_from_ucs4;
|
||||
gunichar *ucs4_result;
|
||||
gint bytes_written;
|
||||
gint n_chars;
|
||||
gchar *utf8_result;
|
||||
|
||||
if (!(utf16_expected_tmp = (gunichar2 *)g_convert (utf8, -1, "UTF-16", "UTF-8",
|
||||
NULL, &bytes_written, NULL)))
|
||||
{
|
||||
fail ("line %d: could not convert to UTF-16 via g_convert\n", line);
|
||||
return;
|
||||
}
|
||||
|
||||
/* zero-terminate and remove BOM
|
||||
*/
|
||||
n_chars = bytes_written / 2;
|
||||
if (utf16_expected_tmp[0] == 0xfeff) /* BOM */
|
||||
{
|
||||
n_chars--;
|
||||
utf16_expected = g_new (gunichar2, n_chars + 1);
|
||||
memcpy (utf16_expected, utf16_expected_tmp + 1, sizeof(gunichar2) * n_chars);
|
||||
}
|
||||
else if (utf16_expected_tmp[0] == 0xfffe) /* ANTI-BOM */
|
||||
{
|
||||
fail ("line %d: conversion via iconv to \"UTF-16\" is not native-endian\n");
|
||||
return;
|
||||
}
|
||||
else
|
||||
{
|
||||
utf16_expected = g_new (gunichar2, n_chars + 1);
|
||||
memcpy (utf16_expected, utf16_expected_tmp, sizeof(gunichar2) * n_chars);
|
||||
}
|
||||
|
||||
utf16_expected[n_chars] = '\0';
|
||||
|
||||
if (!(utf16_from_utf8 = g_utf8_to_utf16 (utf8, -1, &items_read, &items_written, &error)))
|
||||
{
|
||||
fail ("line %d: conversion to ucs16 failed: %s\n", line, error->message);
|
||||
return;
|
||||
}
|
||||
|
||||
if (items_read != strlen (utf8) ||
|
||||
utf16_count (utf16_from_utf8) != items_written)
|
||||
{
|
||||
fail ("line %d: length error in conversion to ucs16\n", line);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!(utf16_from_ucs4 = g_ucs4_to_utf16 (ucs4, -1, &items_read, &items_written, &error)))
|
||||
{
|
||||
fail ("line %d: conversion to ucs16 failed: %s\n", line, error->message);
|
||||
return;
|
||||
}
|
||||
|
||||
if (items_read != ucs4_len ||
|
||||
utf16_count (utf16_from_ucs4) != items_written)
|
||||
{
|
||||
fail ("line %d: length error in conversion to ucs16\n", line);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!utf16_equal (utf16_from_utf8, utf16_expected) ||
|
||||
!utf16_equal (utf16_from_ucs4, utf16_expected))
|
||||
{
|
||||
fail ("line %d: results of conversion to ucs16 do not match\n", line);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!(utf8_result = g_utf16_to_utf8 (utf16_from_utf8, -1, &items_read, &items_written, &error)))
|
||||
{
|
||||
fail ("line %d: conversion back to utf8 failed: %s\n", line, error->message);
|
||||
return;
|
||||
}
|
||||
|
||||
if (items_read != utf16_count (utf16_from_utf8) ||
|
||||
items_written != strlen (utf8))
|
||||
{
|
||||
fail ("line %d: length error in conversion from ucs16 to utf8\n", line);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!(ucs4_result = g_utf16_to_ucs4 (utf16_from_ucs4, -1, &items_read, &items_written, &error)))
|
||||
{
|
||||
fail ("line %d: conversion back to utf8/ucs4 failed\n", line);
|
||||
return;
|
||||
}
|
||||
|
||||
if (items_read != utf16_count (utf16_from_utf8) ||
|
||||
items_written != ucs4_len)
|
||||
{
|
||||
fail ("line %d: length error in conversion from ucs16 to ucs4\n", line);
|
||||
return;
|
||||
}
|
||||
|
||||
if (strcmp (utf8, utf8_result) != 0 ||
|
||||
!ucs4_equal (ucs4, ucs4_result))
|
||||
{
|
||||
fail ("line %d: conversion back to utf8/ucs4 did not match original\n", line);
|
||||
return;
|
||||
}
|
||||
|
||||
g_free (utf16_expected_tmp);
|
||||
g_free (utf16_expected);
|
||||
g_free (utf16_from_utf8);
|
||||
g_free (utf16_from_ucs4);
|
||||
g_free (utf8_result);
|
||||
g_free (ucs4_result);
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
main (int argc, char **argv)
|
||||
{
|
||||
gchar *srcdir = getenv ("srcdir");
|
||||
gchar *testfile;
|
||||
gchar *contents;
|
||||
GError *error = NULL;
|
||||
gchar *p, *end;
|
||||
char *tmp;
|
||||
gint state = 0;
|
||||
gint line = 1;
|
||||
gint start_line = 0; /* Quiet GCC */
|
||||
gchar *utf8 = NULL; /* Quiet GCC */
|
||||
GArray *ucs4;
|
||||
Status status = VALID; /* Quiet GCC */
|
||||
|
||||
if (!srcdir)
|
||||
srcdir = ".";
|
||||
|
||||
testfile = g_strconcat (srcdir, "/", "utf8.txt", NULL);
|
||||
|
||||
g_file_get_contents (testfile, &contents, NULL, &error);
|
||||
if (error)
|
||||
croak ("Cannot open utf8.txt: %s", error->message);
|
||||
|
||||
ucs4 = g_array_new (TRUE, FALSE, sizeof(gunichar));
|
||||
|
||||
p = contents;
|
||||
|
||||
/* Loop over lines */
|
||||
while (*p)
|
||||
{
|
||||
while (*p && (*p == ' ' || *p == '\t'))
|
||||
p++;
|
||||
|
||||
end = p;
|
||||
while (*end && *end != '\n')
|
||||
end++;
|
||||
|
||||
if (!*p || *p == '#' || *p == '\n')
|
||||
goto next_line;
|
||||
|
||||
tmp = g_strstrip (g_strndup (p, end - p));
|
||||
|
||||
switch (state)
|
||||
{
|
||||
case 0:
|
||||
/* UTF-8 string */
|
||||
start_line = line;
|
||||
utf8 = tmp;
|
||||
tmp = NULL;
|
||||
break;
|
||||
|
||||
case 1:
|
||||
/* Status */
|
||||
if (!strcmp (tmp, "VALID"))
|
||||
status = VALID;
|
||||
else if (!strcmp (tmp, "INCOMPLETE"))
|
||||
status = INCOMPLETE;
|
||||
else if (!strcmp (tmp, "NOTUNICODE"))
|
||||
status = NOTUNICODE;
|
||||
else if (!strcmp (tmp, "OVERLONG"))
|
||||
status = OVERLONG;
|
||||
else if (!strcmp (tmp, "MALFORMED"))
|
||||
status = MALFORMED;
|
||||
else
|
||||
croak ("Invalid status on line %d\n", line);
|
||||
|
||||
if (status != VALID && status != NOTUNICODE)
|
||||
state++; /* No UCS-4 data */
|
||||
|
||||
break;
|
||||
|
||||
case 2:
|
||||
/* UCS-4 version */
|
||||
|
||||
p = strtok (tmp, " \t");
|
||||
while (p)
|
||||
{
|
||||
gchar *endptr;
|
||||
|
||||
gunichar ch = strtoul (p, &endptr, 16);
|
||||
if (*endptr != '\0')
|
||||
croak ("Invalid UCS-4 character on line %d\n", line);
|
||||
|
||||
g_array_append_val (ucs4, ch);
|
||||
|
||||
p = strtok (NULL, " \t");
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
g_free (tmp);
|
||||
state = (state + 1) % 3;
|
||||
|
||||
if (state == 0)
|
||||
{
|
||||
process (start_line, utf8, status, (gunichar *)ucs4->data, ucs4->len);
|
||||
g_array_set_size (ucs4, 0);
|
||||
g_free (utf8);
|
||||
}
|
||||
|
||||
next_line:
|
||||
p = end;
|
||||
if (*p && *p == '\n')
|
||||
p++;
|
||||
|
||||
line++;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
297
tests/utf8.txt
Normal file
297
tests/utf8.txt
Normal file
@ -0,0 +1,297 @@
|
||||
# This file is derived from
|
||||
#
|
||||
# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
|
||||
#
|
||||
# Which was created by Markus Kuhn <mkuhn@acm.org> - 2000-09-02
|
||||
#
|
||||
# lines beginning with # and blank lines are ignored
|
||||
#
|
||||
# Beyond that, this file consists of a series of test cases. Each test case consists of
|
||||
# 2 or 3 lines:
|
||||
#
|
||||
# 1. A UTF-8 string
|
||||
# 2. A status
|
||||
# VALID : The string is a valid UTF-8 representation of valid Unicode
|
||||
# INCOMPLETE : The string has a partial character at the end
|
||||
# NOTUNICODE : The string is valid UTF-8, but the characters represented
|
||||
# are not valid Unicode (surrogates, U+FFFE/U+FFFF, or values >= 0x110000)
|
||||
# OVERLONG : The string includes overlong sequences
|
||||
# MALFORMED : The string is not valid UTF-8
|
||||
# 3. If the status is VALID or NOTUNICODE, the UCS-4 representation of the string,
|
||||
# as a series of hex numbers.
|
||||
|
||||
# 1 Some correct UTF-8 text
|
||||
κόσμε
|
||||
VALID
|
||||
03ba 1f79 03c3 03bc 03b5
|
||||
|
||||
# 2.1 First possible sequence of a certain length
|
||||
#
|
||||
# FIXME - handle NULLS?
|
||||
#
|
||||
# [ NULL BYTE ]
|
||||
#VALID
|
||||
#0000
|
||||
|
||||
€
|
||||
VALID
|
||||
0080
|
||||
|
||||
à €
|
||||
VALID
|
||||
0800
|
||||
|
||||
ð<EFBFBD>€€
|
||||
VALID
|
||||
00010000
|
||||
|
||||
øˆ€€€
|
||||
NOTUNICODE
|
||||
00200000
|
||||
|
||||
ü„€€€€
|
||||
NOTUNICODE
|
||||
04000000
|
||||
|
||||
|
||||
VALID
|
||||
0000007f
|
||||
|
||||
ß¿
|
||||
VALID
|
||||
000007ff
|
||||
|
||||
ï¿¿
|
||||
NOTUNICODE
|
||||
0000ffff
|
||||
|
||||
÷¿¿¿
|
||||
NOTUNICODE
|
||||
001fffff
|
||||
|
||||
û¿¿¿¿
|
||||
NOTUNICODE
|
||||
03ffffff
|
||||
|
||||
ý¿¿¿¿¿
|
||||
NOTUNICODE
|
||||
7fffffff
|
||||
|
||||
# 2.3 Other boundary conditions
|
||||
|
||||
퟿
|
||||
VALID
|
||||
d7ff
|
||||
|
||||

|
||||
VALID
|
||||
e000
|
||||
|
||||
�
|
||||
VALID
|
||||
fffd
|
||||
|
||||
ô<EFBFBD>¿¿
|
||||
VALID
|
||||
0010ffff
|
||||
|
||||
ô<EFBFBD>€€
|
||||
NOTUNICODE
|
||||
00110000
|
||||
|
||||
# 3.1 Unexpected continuation bytes
|
||||
|
||||
€
|
||||
MALFORMED
|
||||
¿
|
||||
MALFORMED
|
||||
€¿
|
||||
MALFORMED
|
||||
€¿€
|
||||
MALFORMED
|
||||
€¿€¿
|
||||
MALFORMED
|
||||
€¿€¿€
|
||||
MALFORMED
|
||||
€¿€¿€¿
|
||||
MALFORMED
|
||||
€¿€¿€¿€
|
||||
MALFORMED
|
||||
€<EFBFBD>‚ƒ„…†‡ˆ‰Š‹Œ<EFBFBD>Ž<EFBFBD><EFBFBD>‘’“”•–—˜™š›œ<EFBFBD>žŸ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿
|
||||
MALFORMED
|
||||
|
||||
# 3.2 Lonely start characters
|
||||
|
||||
À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß
|
||||
MALFORMED
|
||||
à á â ã ä å æ ç è é ê ë ì í î ï
|
||||
MALFORMED
|
||||
ð ñ ò ó ô õ ö ÷
|
||||
MALFORMED
|
||||
ø ù ú û
|
||||
MALFORMED
|
||||
ü ý
|
||||
MALFORMED
|
||||
|
||||
# 3.3 Sequences with last continuation byte missing
|
||||
|
||||
À
|
||||
INCOMPLETE
|
||||
à€
|
||||
INCOMPLETE
|
||||
ð€€
|
||||
INCOMPLETE
|
||||
ø€€€
|
||||
INCOMPLETE
|
||||
ü€€€€
|
||||
INCOMPLETE
|
||||
ß
|
||||
INCOMPLETE
|
||||
ï¿
|
||||
INCOMPLETE
|
||||
÷¿¿
|
||||
INCOMPLETE
|
||||
û¿¿¿
|
||||
INCOMPLETE
|
||||
ý¿¿¿¿
|
||||
INCOMPLETE
|
||||
|
||||
# 3.4 Concatenation of incomplete sequences
|
||||
|
||||
Àà€ð€€ø€€€ü€€€€ßï¿÷¿¿û¿¿¿ý¿¿¿¿
|
||||
MALFORMED
|
||||
|
||||
# 3.5 Impossible bytes
|
||||
|
||||
þ
|
||||
MALFORMED
|
||||
ÿ
|
||||
MALFORMED
|
||||
þþÿÿ
|
||||
MALFORMED
|
||||
|
||||
# Examples of an overlong ASCII character
|
||||
|
||||
À¯
|
||||
OVERLONG
|
||||
à€¯
|
||||
OVERLONG
|
||||
ð€€¯
|
||||
OVERLONG
|
||||
ø€€€¯
|
||||
OVERLONG
|
||||
ü€€€€¯
|
||||
OVERLONG
|
||||
|
||||
# Maximum overlong sequences
|
||||
|
||||
Á¿
|
||||
OVERLONG
|
||||
àŸ¿
|
||||
OVERLONG
|
||||
ð<EFBFBD>¿¿
|
||||
OVERLONG
|
||||
ø‡¿¿¿
|
||||
OVERLONG
|
||||
üƒ¿¿¿¿
|
||||
OVERLONG
|
||||
|
||||
# Overlong representation of the NUL character
|
||||
|
||||
À€
|
||||
OVERLONG
|
||||
à€€
|
||||
OVERLONG
|
||||
ð€€€
|
||||
OVERLONG
|
||||
ø€€€€
|
||||
OVERLONG
|
||||
ü€€€€€
|
||||
OVERLONG
|
||||
|
||||
# Illegal code positions
|
||||
|
||||
# Single UTF-16 surrogates
|
||||
|
||||
í €
|
||||
NOTUNICODE
|
||||
d800
|
||||
|
||||
í¿
|
||||
NOTUNICODE
|
||||
db7f
|
||||
|
||||
í®€
|
||||
NOTUNICODE
|
||||
db80
|
||||
|
||||
í¯¿
|
||||
NOTUNICODE
|
||||
dbff
|
||||
|
||||
í°€
|
||||
NOTUNICODE
|
||||
dc00
|
||||
|
||||
í¾€
|
||||
NOTUNICODE
|
||||
df80
|
||||
|
||||
í¿¿
|
||||
NOTUNICODE
|
||||
dfff
|
||||
|
||||
# Paired UTF-16 surrogates
|
||||
|
||||
í €í°€
|
||||
NOTUNICODE
|
||||
d800 dc00
|
||||
|
||||
í €í¿¿
|
||||
NOTUNICODE
|
||||
d800 dfff
|
||||
|
||||
í¿í°€
|
||||
NOTUNICODE
|
||||
db7f dc00
|
||||
|
||||
í¿í¿¿
|
||||
NOTUNICODE
|
||||
db7f dfff
|
||||
|
||||
󰀀
|
||||
NOTUNICODE
|
||||
db80 dc00
|
||||
|
||||
󰏿
|
||||
NOTUNICODE
|
||||
db80 dfff
|
||||
|
||||
􏰀
|
||||
NOTUNICODE
|
||||
dbff dc00
|
||||
|
||||
􏿿
|
||||
NOTUNICODE
|
||||
dbff dfff
|
||||
|
||||
# Other illegal code positions
|
||||
|
||||
￾
|
||||
NOTUNICODE
|
||||
fffe
|
||||
|
||||
ï¿¿
|
||||
NOTUNICODE
|
||||
ffff
|
||||
|
||||
################
|
||||
#
|
||||
# Some more tests, not from Markus Kuhn's file
|
||||
#
|
||||
|
||||
# Mixed plane 0 and higher planes
|
||||
|
||||
Að<EFBFBD>€€Bô<EFBFBD>¿¿C
|
||||
VALID
|
||||
41 00010000 42 10ffff 43
|
Loading…
x
Reference in New Issue
Block a user