mirror of
https://gitlab.gnome.org/GNOME/glib.git
synced 2025-09-30 19:06:38 +02:00
Add g_utf8_to_utf16_make_valid() with backtrack functionality
This commit is contained in:
@@ -22,6 +22,7 @@
|
|||||||
#define __G_UNICODE_PRIVATE_H__
|
#define __G_UNICODE_PRIVATE_H__
|
||||||
|
|
||||||
#include "gtypes.h"
|
#include "gtypes.h"
|
||||||
|
#include "gunicode.h"
|
||||||
|
|
||||||
G_BEGIN_DECLS
|
G_BEGIN_DECLS
|
||||||
|
|
||||||
@@ -29,6 +30,17 @@ gunichar *_g_utf8_normalize_wc (const gchar *str,
|
|||||||
gssize max_len,
|
gssize max_len,
|
||||||
GNormalizeMode mode);
|
GNormalizeMode mode);
|
||||||
|
|
||||||
|
void
|
||||||
|
g_utf8_to_utf16_make_valid (const char *utf8,
|
||||||
|
gunichar2 *buffer,
|
||||||
|
size_t buffer_len,
|
||||||
|
gunichar2 **out_utf16,
|
||||||
|
size_t *out_utf16_len);
|
||||||
|
|
||||||
|
size_t
|
||||||
|
g_utf8_to_utf16_make_valid_backtrack (const char *utf8,
|
||||||
|
size_t utf16_len);
|
||||||
|
|
||||||
G_END_DECLS
|
G_END_DECLS
|
||||||
|
|
||||||
#endif /* __G_UNICODE_PRIVATE_H__ */
|
#endif /* __G_UNICODE_PRIVATE_H__ */
|
||||||
|
397
glib/gutf8.c
397
glib/gutf8.c
@@ -27,6 +27,7 @@
|
|||||||
#include <langinfo.h>
|
#include <langinfo.h>
|
||||||
#endif
|
#endif
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
#include <stdbool.h>
|
||||||
|
|
||||||
#ifdef G_PLATFORM_WIN32
|
#ifdef G_PLATFORM_WIN32
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
@@ -41,6 +42,7 @@
|
|||||||
#include "gthread.h"
|
#include "gthread.h"
|
||||||
#include "glibintl.h"
|
#include "glibintl.h"
|
||||||
#include "gvalgrind.h"
|
#include "gvalgrind.h"
|
||||||
|
#include "gunicodeprivate.h"
|
||||||
|
|
||||||
#define UTF8_COMPUTE(Char, Mask, Len) \
|
#define UTF8_COMPUTE(Char, Mask, Len) \
|
||||||
if (Char < 128) \
|
if (Char < 128) \
|
||||||
@@ -1574,6 +1576,401 @@ g_ucs4_to_utf16 (const gunichar *str,
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**< private >
|
||||||
|
* find_invalid_or_incomplete_utf8_sequence:
|
||||||
|
*
|
||||||
|
* @string: the source string.
|
||||||
|
*
|
||||||
|
* Returns the first byte of a sequence that is either invalid
|
||||||
|
* UTF-8 or incomplete UTF-8, or a pointer to the NULL terminator
|
||||||
|
* if all of @string is valid UTF-8.
|
||||||
|
*/
|
||||||
|
static const char *
|
||||||
|
find_invalid_or_incomplete_utf8_sequence (const char *string)
|
||||||
|
{
|
||||||
|
const char *end = string;
|
||||||
|
|
||||||
|
g_utf8_validate (string, -1, &end);
|
||||||
|
|
||||||
|
return end;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**< private >
|
||||||
|
* find_valid_and_complete_utf8_sequence:
|
||||||
|
*
|
||||||
|
* @string: a NULL-terminated source string.
|
||||||
|
*
|
||||||
|
* Returns the first byte of a sequence that is valid (and complete)
|
||||||
|
* UTF-8, or a pointer to the NULL terminator if no such sequence
|
||||||
|
* could be found.
|
||||||
|
*/
|
||||||
|
static const char *
|
||||||
|
find_valid_and_complete_utf8_sequence (const char *string)
|
||||||
|
{
|
||||||
|
const unsigned char *iter = (const unsigned char *)string;
|
||||||
|
|
||||||
|
for (;; iter++)
|
||||||
|
{
|
||||||
|
if (*iter < 128 ||
|
||||||
|
((*iter & 0xC0) == 0xC0 &&
|
||||||
|
g_utf8_get_char_validated ((const char*)iter, -1) < (gunichar2)-2))
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return (const char *) iter;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**< private >
|
||||||
|
* invalidly_encoded_string_to_utf16_get_output_length:
|
||||||
|
*
|
||||||
|
* @start: start of the source string.
|
||||||
|
* @end: end of the source string (excluded).
|
||||||
|
*
|
||||||
|
* Returns the output length, as a count of gunichar2, that is necessary
|
||||||
|
* for the generic translation of an invalidly-encoded string to UTF-16.
|
||||||
|
*/
|
||||||
|
static size_t
|
||||||
|
invalidly_encoded_string_to_utf16_get_output_length (const char *start,
|
||||||
|
const char *end)
|
||||||
|
{
|
||||||
|
size_t count;
|
||||||
|
|
||||||
|
g_assert ((uintptr_t)end >= (uintptr_t)start);
|
||||||
|
|
||||||
|
/* We output one gunichar2 for each input byte */
|
||||||
|
count = (uintptr_t)end - (uintptr_t)start;
|
||||||
|
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**< private >
|
||||||
|
* invalidly_encoded_string_to_utf16:
|
||||||
|
*
|
||||||
|
* @start: start of the string.
|
||||||
|
* @end: end of the string (excluded).
|
||||||
|
* @output: the output buffer. Must be long enough to hold
|
||||||
|
* the entire output.
|
||||||
|
*
|
||||||
|
* Performs a generic conversion of an invalidly-encoded string
|
||||||
|
* to UTF-16. Note: the current implementation simply outputs
|
||||||
|
* Unicode Replacement Characters "<22>" (U+FFFD) for each byte in
|
||||||
|
* the source string.
|
||||||
|
*/
|
||||||
|
static size_t
|
||||||
|
invalidly_encoded_string_to_utf16 (const char *start,
|
||||||
|
const char *end,
|
||||||
|
gunichar2 *output)
|
||||||
|
{
|
||||||
|
size_t count;
|
||||||
|
|
||||||
|
g_assert ((uintptr_t)end >= (uintptr_t)start);
|
||||||
|
count = (uintptr_t)end - (uintptr_t)start;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < count; i++)
|
||||||
|
output[i] = 0xFFFD;
|
||||||
|
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**< private >
|
||||||
|
* invalidly_encoded_string_to_utf16_backtrack:
|
||||||
|
*
|
||||||
|
* @start: start of the source string.
|
||||||
|
* @output_length: length within the output UTF-16 string
|
||||||
|
* expressed as a count of gunichar2.
|
||||||
|
*
|
||||||
|
* Backtracks an output-length in count of gunichar2 to the
|
||||||
|
* corresponding length, in bytes, of the source string.
|
||||||
|
*/
|
||||||
|
static size_t
|
||||||
|
invalidly_encoded_string_to_utf16_backtrack (const char *start,
|
||||||
|
size_t output_length)
|
||||||
|
{
|
||||||
|
/* The conversion process outputs one gunichar2 (a complete
|
||||||
|
* character) for each input byte, so the mapping is very
|
||||||
|
* simple.
|
||||||
|
*/
|
||||||
|
return output_length;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**< private >
|
||||||
|
* valid_utf8_to_utf16_get_output_length:
|
||||||
|
*
|
||||||
|
* @start: start of the source string. Must be valid UTF-8.
|
||||||
|
* @end: end of the source string (excluded).
|
||||||
|
*
|
||||||
|
* Returns the output-length, in count of gunichar2, necessary for the
|
||||||
|
* translation of a valid UTF-8 string to UTF-16.
|
||||||
|
*/
|
||||||
|
static size_t
|
||||||
|
valid_utf8_to_utf16_get_output_length (const char *start,
|
||||||
|
const char *end)
|
||||||
|
{
|
||||||
|
size_t count = 0;
|
||||||
|
|
||||||
|
while (start < end)
|
||||||
|
{
|
||||||
|
gunichar codepoint = g_utf8_get_char (start);
|
||||||
|
|
||||||
|
if (codepoint <= 0xFFFF)
|
||||||
|
count += 1;
|
||||||
|
else
|
||||||
|
count += 2;
|
||||||
|
|
||||||
|
start = g_utf8_next_char (start);
|
||||||
|
}
|
||||||
|
|
||||||
|
g_assert (start == end);
|
||||||
|
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**< private >
|
||||||
|
* valid_utf8_to_utf16:
|
||||||
|
*
|
||||||
|
* @start: start of the source string. Must be valid UTF-8
|
||||||
|
* @end: end of the source string (excluded).
|
||||||
|
* @output: the output buffer. Must be long enough to hold
|
||||||
|
* the entire output.
|
||||||
|
*
|
||||||
|
* Performs the conversion of a valid UTF-8 string to UTF-16.
|
||||||
|
*/
|
||||||
|
static size_t
|
||||||
|
valid_utf8_to_utf16 (const char *start,
|
||||||
|
const char *end,
|
||||||
|
gunichar2 *output)
|
||||||
|
{
|
||||||
|
size_t count = 0;
|
||||||
|
|
||||||
|
while (start < end)
|
||||||
|
{
|
||||||
|
gunichar codepoint = g_utf8_get_char (start);
|
||||||
|
|
||||||
|
if (codepoint <= 0xFFFF)
|
||||||
|
{
|
||||||
|
output[count++] = (gunichar2) codepoint;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
gunichar subtract = codepoint - 0x010000;
|
||||||
|
output[count++] = 0xD800 + ((subtract >> 10) & 0x3FF);
|
||||||
|
output[count++] = 0xDC00 + (subtract & 0x3FF);
|
||||||
|
}
|
||||||
|
|
||||||
|
start = g_utf8_next_char (start);
|
||||||
|
}
|
||||||
|
|
||||||
|
g_assert (start == end);
|
||||||
|
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**< private >
|
||||||
|
* valid_utf8_to_utf16_backtrack:
|
||||||
|
*
|
||||||
|
* @start: start of the source string. Must be valid UTF-8.
|
||||||
|
* @output_length: length within the output UTF-16 string expressed
|
||||||
|
* as a count of gunichar2.
|
||||||
|
*
|
||||||
|
* Backtracks an output-length in count of gunichar2 to the
|
||||||
|
* corresponding length, in bytes, of the source string.
|
||||||
|
*/
|
||||||
|
static size_t
|
||||||
|
valid_utf8_to_utf16_backtrack (const char *start,
|
||||||
|
size_t output_length)
|
||||||
|
{
|
||||||
|
const char *iter = start;
|
||||||
|
size_t count = 0;
|
||||||
|
|
||||||
|
for (; *iter != '\0'; iter = g_utf8_next_char (iter))
|
||||||
|
{
|
||||||
|
if (output_length <= count)
|
||||||
|
break;
|
||||||
|
|
||||||
|
if (g_utf8_get_char (iter) <= 0xFFFF)
|
||||||
|
count += 1;
|
||||||
|
else
|
||||||
|
count += 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
return (uintptr_t)iter - (uintptr_t)start;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static size_t
|
||||||
|
utf8_to_utf16_make_valid_get_output_length (const char *string)
|
||||||
|
{
|
||||||
|
const char *start = string;
|
||||||
|
size_t count = 0;
|
||||||
|
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
const char *end = NULL;
|
||||||
|
|
||||||
|
end = find_invalid_or_incomplete_utf8_sequence (start);
|
||||||
|
count += valid_utf8_to_utf16_get_output_length (start, end);
|
||||||
|
start = end;
|
||||||
|
|
||||||
|
if (start[0] == '\0')
|
||||||
|
break;
|
||||||
|
|
||||||
|
end = find_valid_and_complete_utf8_sequence (start);
|
||||||
|
g_assert ((uintptr_t)end > (uintptr_t)start);
|
||||||
|
count += invalidly_encoded_string_to_utf16_get_output_length (start, end);
|
||||||
|
start = end;
|
||||||
|
|
||||||
|
if (start[0] == '\0')
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
static size_t
|
||||||
|
utf8_to_utf16_make_valid_backtrack (const char *string,
|
||||||
|
size_t output_length)
|
||||||
|
{
|
||||||
|
const char *start = string;
|
||||||
|
size_t count = 0;
|
||||||
|
size_t l;
|
||||||
|
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
const char *end = NULL;
|
||||||
|
|
||||||
|
end = find_invalid_or_incomplete_utf8_sequence (start);
|
||||||
|
l = valid_utf8_to_utf16_get_output_length (start, end);
|
||||||
|
if (output_length < count + l)
|
||||||
|
return count + valid_utf8_to_utf16_backtrack (start, output_length);
|
||||||
|
count += (uintptr_t)end - (uintptr_t)start;
|
||||||
|
output_length -= l;
|
||||||
|
start = end;
|
||||||
|
|
||||||
|
if (start[0] == '\0')
|
||||||
|
return (uintptr_t)start - (uintptr_t)string;
|
||||||
|
|
||||||
|
end = find_valid_and_complete_utf8_sequence (start);
|
||||||
|
g_assert ((uintptr_t)end > (uintptr_t)start);
|
||||||
|
l = invalidly_encoded_string_to_utf16_get_output_length (start, end);
|
||||||
|
if (output_length < l)
|
||||||
|
return count + invalidly_encoded_string_to_utf16_backtrack (start, output_length);
|
||||||
|
count += (uintptr_t)end - (uintptr_t)start;
|
||||||
|
output_length -= l;
|
||||||
|
start = end;
|
||||||
|
|
||||||
|
if (start[0] == '\0')
|
||||||
|
return (uintptr_t)start - (uintptr_t)string;
|
||||||
|
}
|
||||||
|
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static size_t
|
||||||
|
utf8_to_utf16_make_valid (const char *string,
|
||||||
|
gunichar2 *output)
|
||||||
|
{
|
||||||
|
const char *start = string;
|
||||||
|
size_t count = 0;
|
||||||
|
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
const char *end = NULL;
|
||||||
|
|
||||||
|
end = find_invalid_or_incomplete_utf8_sequence (start);
|
||||||
|
count += valid_utf8_to_utf16 (start, end, &output[count]);
|
||||||
|
start = end;
|
||||||
|
|
||||||
|
if (start[0] == '\0')
|
||||||
|
break;
|
||||||
|
|
||||||
|
end = find_valid_and_complete_utf8_sequence (start);
|
||||||
|
g_assert ((uintptr_t)end > (uintptr_t)start);
|
||||||
|
count += invalidly_encoded_string_to_utf16 (start, end, &output[count]);
|
||||||
|
start = end;
|
||||||
|
|
||||||
|
if (start[0] == '\0')
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** < private >
|
||||||
|
* g_utf8_to_utf16_make_valid:
|
||||||
|
*
|
||||||
|
* @utf8: source UTF-8 string. May contain invalid or incomplete sequences.
|
||||||
|
* @buffer: optional auxiliary buffer where the output UTF-16 string will be
|
||||||
|
* stored if large enough to hold the output. Callers can pass NULL,
|
||||||
|
* in which case the output buffer is allocated on the heap.
|
||||||
|
* @buffer_len: length, in count of gunichar2, of @buffer. This is used only
|
||||||
|
* if @buffer is not NULL.
|
||||||
|
* @out_utf16: pointer that will be set the to output string. If @buffer is
|
||||||
|
* long enough to hold the data, *out_utf16 will equal @buffer
|
||||||
|
* upon return; otherwise *out_utf16 will point to heap-allocated
|
||||||
|
* data, which must be freed using `g_free`.
|
||||||
|
* @out_utf16_len: pointer to size_t that will be set to the length of the
|
||||||
|
* output UTF-16 string on return, in count of gunichar2.
|
||||||
|
* Can be NULL.
|
||||||
|
*
|
||||||
|
* Performs conversion of an UTF-8 string that may contain invalid sequences
|
||||||
|
* to UTF-16.
|
||||||
|
*
|
||||||
|
* On return, the caller should check if *out_utf16 equals @buffer and call
|
||||||
|
* `g_free` accordingly.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
g_utf8_to_utf16_make_valid (const char *utf8,
|
||||||
|
gunichar2 *buffer,
|
||||||
|
size_t buffer_len,
|
||||||
|
gunichar2 **out_utf16,
|
||||||
|
size_t *out_utf16_len)
|
||||||
|
{
|
||||||
|
size_t output_length = utf8_to_utf16_make_valid_get_output_length (utf8);
|
||||||
|
|
||||||
|
if (output_length < buffer_len)
|
||||||
|
{
|
||||||
|
*out_utf16 = buffer;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* output_length cannot be greater than strlen (utf8), which
|
||||||
|
* is less than SIZE_MAX since utf8 is null-terminated.
|
||||||
|
* As such, (output_length + 1) cannot overflow.
|
||||||
|
*/
|
||||||
|
*out_utf16 = g_new (gunichar2, output_length + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
utf8_to_utf16_make_valid (utf8, *out_utf16);
|
||||||
|
|
||||||
|
/* Add the terminating NULL character */
|
||||||
|
(*out_utf16)[output_length] = L'\0';
|
||||||
|
|
||||||
|
if (out_utf16_len)
|
||||||
|
*out_utf16_len = output_length;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** < private >
|
||||||
|
* g_utf8_to_utf16_make_valid_backtrack:
|
||||||
|
*
|
||||||
|
* @utf8: source UTF-8 string. May contain invalid or incomplete sequences.
|
||||||
|
* @utf16_len: length within the output UTF-16 string expressed as a count
|
||||||
|
* of gunichar2.
|
||||||
|
*
|
||||||
|
* Backtracks an output-length in count of gunichar2 to the
|
||||||
|
* corresponding length, in bytes, of the source string.
|
||||||
|
*/
|
||||||
|
size_t
|
||||||
|
g_utf8_to_utf16_make_valid_backtrack (const char *utf8,
|
||||||
|
size_t utf16_len)
|
||||||
|
{
|
||||||
|
return utf8_to_utf16_make_valid_backtrack (utf8, utf16_len);
|
||||||
|
}
|
||||||
|
|
||||||
/* SIMD-based UTF-8 validation originates in the c-utf8 project from
|
/* SIMD-based UTF-8 validation originates in the c-utf8 project from
|
||||||
* https://github.com/c-util/c-utf8/ from the following authors:
|
* https://github.com/c-util/c-utf8/ from the following authors:
|
||||||
*
|
*
|
||||||
|
Reference in New Issue
Block a user