Add g_utf8_to_utf16_make_valid() with backtrack functionality

2025-11-23 18:58:56 +01:00 · 2025-05-29 17:03:36 +02:00
parent e57cbd8b47
commit c4be7824d5
2 changed files with 409 additions and 0 deletions
--- a/glib/gunicodeprivate.h
+++ b/glib/gunicodeprivate.h
@@ -22,6 +22,7 @@
 #define __G_UNICODE_PRIVATE_H__
 #include "gtypes.h"
 #include "gunicode.h"
 G_BEGIN_DECLS
@@ -29,6 +30,17 @@ gunichar *_g_utf8_normalize_wc (const gchar    *str,
                                gssize          max_len,
 				GNormalizeMode  mode);
 void
 g_utf8_to_utf16_make_valid (const char  *utf8,
                            gunichar2   *buffer,
                            size_t       buffer_len,
                            gunichar2  **out_utf16,
                            size_t      *out_utf16_len);
 size_t
 g_utf8_to_utf16_make_valid_backtrack (const char *utf8,
                                      size_t      utf16_len);
 G_END_DECLS
 #endif /* __G_UNICODE_PRIVATE_H__ */
--- a/glib/gutf8.c
+++ b/glib/gutf8.c
@@ -27,6 +27,7 @@
 #include <langinfo.h>
 #endif
 #include <string.h>
 #include <stdbool.h>
 #ifdef G_PLATFORM_WIN32
 #include <stdio.h>
@@ -41,6 +42,7 @@
 #include "gthread.h"
 #include "glibintl.h"
 #include "gvalgrind.h"
 #include "gunicodeprivate.h"
 #define UTF8_COMPUTE(Char, Mask, Len)					      \
  if (Char < 128)							      \
@@ -1574,6 +1576,401 @@ g_ucs4_to_utf16 (const gunichar  *str,
  return result;
 }
 /**< private >
 * find_invalid_or_incomplete_utf8_sequence:
 *
 * @string: the source string.
 *
 * Returns the first byte of a sequence that is either invalid
 * UTF-8 or incomplete UTF-8, or a pointer to the NULL terminator
 * if all of @string is valid UTF-8.
 */
 static const char *
 find_invalid_or_incomplete_utf8_sequence (const char *string)
 {
  const char *end = string;
  g_utf8_validate (string, -1, &end);
  return end;
 }
 /**< private >
 * find_valid_and_complete_utf8_sequence:
 *
 * @string: a NULL-terminated source string.
 *
 * Returns the first byte of a sequence that is valid (and complete)
 * UTF-8, or a pointer to the NULL terminator if no such sequence
 * could be found.
 */
 static const char *
 find_valid_and_complete_utf8_sequence (const char *string)
 {
  const unsigned char *iter = (const unsigned char *)string;
  for (;; iter++)
    {
      if (*iter < 128 ||
          ((*iter & 0xC0) == 0xC0 &&
           g_utf8_get_char_validated ((const char*)iter, -1) < (gunichar2)-2))
        {
          break;
        }
    }
  return (const char *) iter;
 }
 /**< private >
 * invalidly_encoded_string_to_utf16_get_output_length:
 *
 * @start: start of the source string.
 * @end: end of the source string (excluded).
 *
 * Returns the output length, as a count of gunichar2, that is necessary
 * for the generic translation of an invalidly-encoded string to UTF-16.
 */
 static size_t
 invalidly_encoded_string_to_utf16_get_output_length (const char *start,
                                                     const char *end)
 {
  size_t count;
  g_assert ((uintptr_t)end >= (uintptr_t)start);
  /* We output one gunichar2 for each input byte */
  count = (uintptr_t)end - (uintptr_t)start;
  return count;
 }
 /**< private >
 * invalidly_encoded_string_to_utf16:
 *
 * @start: start of the string.
 * @end: end of the string (excluded).
 * @output: the output buffer. Must be long enough to hold
 *          the entire output.
 *
 * Performs a generic conversion of an invalidly-encoded string
 * to UTF-16. Note: the current implementation simply outputs
 * Unicode Replacement Characters "<22>" (U+FFFD) for each byte in
 * the source string.
 */
 static size_t
 invalidly_encoded_string_to_utf16 (const char *start,
                                   const char *end,
                                   gunichar2  *output)
 {
  size_t count;
  g_assert ((uintptr_t)end >= (uintptr_t)start);
  count = (uintptr_t)end - (uintptr_t)start;
  for (size_t i = 0; i < count; i++)
    output[i] = 0xFFFD;
  return count;
 }
 /**< private >
 * invalidly_encoded_string_to_utf16_backtrack:
 *
 * @start: start of the source string.
 * @output_length: length within the output UTF-16 string
 *                 expressed as a count of gunichar2.
 *
 * Backtracks an output-length in count of gunichar2 to the
 * corresponding length, in bytes, of the source string.
 */
 static size_t
 invalidly_encoded_string_to_utf16_backtrack (const char *start,
                                             size_t      output_length)
 {
  /* The conversion process outputs one gunichar2 (a complete
   * character) for each input byte, so the mapping is very
   * simple.
   */
  return output_length;
 }
 /**< private >
 * valid_utf8_to_utf16_get_output_length:
 *
 * @start: start of the source string. Must be valid UTF-8.
 * @end: end of the source string (excluded).
 *
 * Returns the output-length, in count of gunichar2, necessary for the
 * translation of a valid UTF-8 string to UTF-16.
 */
 static size_t
 valid_utf8_to_utf16_get_output_length (const char *start,
                                       const char *end)
 {
  size_t count = 0;
  while (start < end)
    {
      gunichar codepoint = g_utf8_get_char (start);
      if (codepoint <= 0xFFFF)
        count += 1;
      else
        count += 2;
      start = g_utf8_next_char (start);
    }
  g_assert (start == end);
  return count;
 }
 /**< private >
 * valid_utf8_to_utf16:
 *
 * @start: start of the source string. Must be valid UTF-8
 * @end: end of the source string (excluded).
 * @output: the output buffer. Must be long enough to hold
 *          the entire output.
 *
 * Performs the conversion of a valid UTF-8 string to UTF-16.
 */
 static size_t
 valid_utf8_to_utf16 (const char *start,
                     const char *end,
                     gunichar2  *output)
 {
  size_t count = 0;
  while (start < end)
    {
      gunichar codepoint = g_utf8_get_char (start);
      if (codepoint <= 0xFFFF)
        {
          output[count++] = (gunichar2) codepoint;
        }
      else
        {
          gunichar subtract = codepoint - 0x010000;
          output[count++] = 0xD800 + ((subtract >> 10) & 0x3FF);
          output[count++] = 0xDC00 + (subtract & 0x3FF);
        }
      start = g_utf8_next_char (start);
    }
  g_assert (start == end);
  return count;
 }
 /**< private >
 * valid_utf8_to_utf16_backtrack:
 *
 * @start: start of the source string. Must be valid UTF-8.
 * @output_length: length within the output UTF-16 string expressed
 *                 as a count of gunichar2.
 *
 * Backtracks an output-length in count of gunichar2 to the
 * corresponding length, in bytes, of the source string.
 */
 static size_t
 valid_utf8_to_utf16_backtrack (const char *start,
                               size_t      output_length)
 {
  const char *iter = start;
  size_t count = 0;
  for (; *iter != '\0'; iter = g_utf8_next_char (iter))
    {
      if (output_length <= count)
        break;
      if (g_utf8_get_char (iter) <= 0xFFFF)
        count += 1;
      else
        count += 2;
    }
  return (uintptr_t)iter - (uintptr_t)start;
 }
 static size_t
 utf8_to_utf16_make_valid_get_output_length (const char *string)
 {
  const char *start = string;
  size_t count = 0;
  while (true)
    {
      const char *end = NULL;
      end = find_invalid_or_incomplete_utf8_sequence (start);
      count += valid_utf8_to_utf16_get_output_length (start, end);
      start = end;
      if (start[0] == '\0')
        break;
      end = find_valid_and_complete_utf8_sequence (start);
      g_assert ((uintptr_t)end > (uintptr_t)start);
      count += invalidly_encoded_string_to_utf16_get_output_length (start, end);
      start = end;
      if (start[0] == '\0')
        break;
    }
  return count;
 }
 static size_t
 utf8_to_utf16_make_valid_backtrack (const char *string,
                                    size_t      output_length)
 {
  const char *start = string;
  size_t count = 0;
  size_t l;
  while (true)
    {
      const char *end = NULL;
      end = find_invalid_or_incomplete_utf8_sequence (start);
      l = valid_utf8_to_utf16_get_output_length (start, end);
      if (output_length < count + l)
        return count + valid_utf8_to_utf16_backtrack (start, output_length);
      count += (uintptr_t)end - (uintptr_t)start;
      output_length -= l;
      start = end;
      if (start[0] == '\0')
        return (uintptr_t)start - (uintptr_t)string;
      end = find_valid_and_complete_utf8_sequence (start);
      g_assert ((uintptr_t)end > (uintptr_t)start);
      l = invalidly_encoded_string_to_utf16_get_output_length (start, end);
      if (output_length < l)
        return count + invalidly_encoded_string_to_utf16_backtrack (start, output_length);
      count += (uintptr_t)end - (uintptr_t)start;
      output_length -= l;
      start = end;
      if (start[0] == '\0')
        return (uintptr_t)start - (uintptr_t)string;
    }
  return count;
 }
 static size_t
 utf8_to_utf16_make_valid (const char *string,
                          gunichar2  *output)
 {
  const char *start = string;
  size_t count = 0;
  while (true)
    {
      const char *end = NULL;
      end = find_invalid_or_incomplete_utf8_sequence (start);
      count += valid_utf8_to_utf16 (start, end, &output[count]);
      start = end;
      if (start[0] == '\0')
        break;
      end = find_valid_and_complete_utf8_sequence (start);
      g_assert ((uintptr_t)end > (uintptr_t)start);
      count += invalidly_encoded_string_to_utf16 (start, end, &output[count]);
      start = end;
      if (start[0] == '\0')
        break;
    }
  return count;
 }
 /** < private >
 * g_utf8_to_utf16_make_valid:
 *
 * @utf8: source UTF-8 string. May contain invalid or incomplete sequences.
 * @buffer: optional auxiliary buffer where the output UTF-16 string will be
 *          stored if large enough to hold the output. Callers can pass NULL,
 *          in which case the output buffer is allocated on the heap.
 * @buffer_len: length, in count of gunichar2, of @buffer. This is used only
 *              if @buffer is not NULL.
 * @out_utf16: pointer that will be set the to output string. If @buffer is
 *             long enough to hold the data, *out_utf16 will equal @buffer
 *             upon return; otherwise *out_utf16 will point to heap-allocated
 *             data, which must be freed using `g_free`.
 * @out_utf16_len: pointer to size_t that will be set to the length of the
 *                 output UTF-16 string on return, in count of gunichar2.
 *                 Can be NULL.
 *
 * Performs conversion of an UTF-8 string that may contain invalid sequences
 * to UTF-16.
 *
 * On return, the caller should check if *out_utf16 equals @buffer and call
 * `g_free` accordingly.
 */
 void
 g_utf8_to_utf16_make_valid (const char  *utf8,
                            gunichar2   *buffer,
                            size_t       buffer_len,
                            gunichar2  **out_utf16,
                            size_t      *out_utf16_len)
 {
  size_t output_length = utf8_to_utf16_make_valid_get_output_length (utf8);
  if (output_length < buffer_len)
    {
      *out_utf16 = buffer;
    }
  else
    {
      /* output_length cannot be greater than strlen (utf8), which
       * is less than SIZE_MAX since utf8 is null-terminated.
       * As such, (output_length + 1) cannot overflow.
       */
      *out_utf16 = g_new (gunichar2, output_length + 1);
    }
  utf8_to_utf16_make_valid (utf8, *out_utf16);
  /* Add the terminating NULL character */
  (*out_utf16)[output_length] = L'\0';
  if (out_utf16_len)
    *out_utf16_len = output_length;
 }
 /** < private >
 * g_utf8_to_utf16_make_valid_backtrack:
 *
 * @utf8: source UTF-8 string. May contain invalid or incomplete sequences.
 * @utf16_len: length within the output UTF-16 string expressed as a count
 *             of gunichar2.
 *
 * Backtracks an output-length in count of gunichar2 to the
 * corresponding length, in bytes, of the source string.
 */
 size_t
 g_utf8_to_utf16_make_valid_backtrack (const char  *utf8,
                                      size_t       utf16_len)
 {
  return utf8_to_utf16_make_valid_backtrack (utf8, utf16_len);
 }
 /* SIMD-based UTF-8 validation originates in the c-utf8 project from
 * https://github.com/c-util/c-utf8/ from the following authors:
 *