Add g_utf8_to_utf16_make_valid() with backtrack functionality

2025-11-23 10:49:02 +01:00 · 2025-05-29 17:03:36 +02:00
parent e57cbd8b47
commit c4be7824d5
2 changed files with 409 additions and 0 deletions
--- a/glib/gunicodeprivate.h
+++ b/glib/gunicodeprivate.h
@@ -22,6 +22,7 @@
 #define __G_UNICODE_PRIVATE_H__

 #include "gtypes.h"
+#include "gunicode.h"

 G_BEGIN_DECLS

@@ -29,6 +30,17 @@ gunichar *_g_utf8_normalize_wc (const gchar    *str,
                                gssize          max_len,
 				GNormalizeMode  mode);

+void
+g_utf8_to_utf16_make_valid (const char  *utf8,
+                            gunichar2   *buffer,
+                            size_t       buffer_len,
+                            gunichar2  **out_utf16,
+                            size_t      *out_utf16_len);
+
+size_t
+g_utf8_to_utf16_make_valid_backtrack (const char *utf8,
+                                      size_t      utf16_len);
+
 G_END_DECLS

 #endif /* __G_UNICODE_PRIVATE_H__ */
--- a/glib/gutf8.c
+++ b/glib/gutf8.c
@@ -27,6 +27,7 @@
 #include <langinfo.h>
 #endif
 #include <string.h>
+#include <stdbool.h>

 #ifdef G_PLATFORM_WIN32
 #include <stdio.h>
@@ -41,6 +42,7 @@
 #include "gthread.h"
 #include "glibintl.h"
 #include "gvalgrind.h"
+#include "gunicodeprivate.h"

 #define UTF8_COMPUTE(Char, Mask, Len)					      \
  if (Char < 128)							      \
@@ -1574,6 +1576,401 @@ g_ucs4_to_utf16 (const gunichar  *str,
  return result;
 }

+/**< private >
+ * find_invalid_or_incomplete_utf8_sequence:
+ *
+ * @string: a NUL-terminated source string.
+ *
+ * Returns a pointer to the first byte at which g_utf8_validate()
+ * stops accepting @string, that is the first byte of an invalid or
+ * incomplete sequence, or the NUL terminator if all of it is valid.
+ */
+static const char *
+find_invalid_or_incomplete_utf8_sequence (const char *string)
+{
+  const char *first_invalid = string;
+
+  /* Only the end position is of interest; the verdict is not needed */
+  (void) g_utf8_validate (string, -1, &first_invalid);
+  return first_invalid;
+}
+
+/**< private >
+ * find_valid_and_complete_utf8_sequence:
+ *
+ * @string: a NUL-terminated source string.
+ *
+ * Returns the first byte of a sequence that is valid (and complete)
+ * UTF-8, or a pointer to the NUL terminator if no such sequence
+ * could be found. The NUL terminator is below 128, so the scan stops there.
+ */
+static const char *
+find_valid_and_complete_utf8_sequence (const char *string)
+{
+  const unsigned char *iter = (const unsigned char *)string;
+
+  for (;; iter++)
+    {
+      /* Failure is reported as (gunichar)-1 or (gunichar)-2: compare at
+       * gunichar width so that code points >= U+FFFE are still accepted. */
+      if (*iter < 128 ||
+          ((*iter & 0xC0) == 0xC0 &&
+           g_utf8_get_char_validated ((const char*)iter, -1) < (gunichar)-2))
+        break;
+    }
+
+  return (const char *) iter;
+}
+
+
+/**< private >
+ * invalidly_encoded_string_to_utf16_get_output_length:
+ *
+ * @start: first byte of the invalidly-encoded run.
+ * @end: one past the last byte of the run.
+ *
+ * Returns how many gunichar2 the generic translation of the
+ * invalidly-encoded run [@start, @end) to UTF-16 produces.
+ */
+static size_t
+invalidly_encoded_string_to_utf16_get_output_length (const char *start,
+                                                     const char *end)
+{
+  size_t n_bytes;
+
+  g_assert ((uintptr_t)end >= (uintptr_t)start);
+
+  /* Every input byte becomes exactly one gunichar2, so the output
+   * length is simply the byte length of the run. */
+  n_bytes = (size_t) ((uintptr_t)end - (uintptr_t)start);
+  return n_bytes;
+}
+
+/**< private >
+ * invalidly_encoded_string_to_utf16:
+ *
+ * @start: first byte of the invalidly-encoded run.
+ * @end: one past the last byte of the run.
+ * @output: the output buffer. Must have room for one gunichar2
+ *          per byte of the run.
+ *
+ * Performs a generic conversion of an invalidly-encoded run of bytes
+ * to UTF-16 and returns the number of gunichar2 written. Note: the
+ * current implementation simply writes one Unicode Replacement
+ * Character (U+FFFD) for each byte of the run.
+ */
+static size_t
+invalidly_encoded_string_to_utf16 (const char *start,
+                                   const char *end,
+                                   gunichar2  *output)
+{
+  size_t n_bytes;
+  size_t written = 0;
+
+  g_assert ((uintptr_t)end >= (uintptr_t)start);
+  n_bytes = (uintptr_t)end - (uintptr_t)start;
+
+  while (written < n_bytes)
+    output[written++] = 0xFFFD;
+  return written;
+}
+
+/**< private >
+ * invalidly_encoded_string_to_utf16_backtrack:
+ *
+ * @start: start of the invalidly-encoded run (not read by this function).
+ * @output_length: length within the output UTF-16 string
+ *                 expressed as a count of gunichar2.
+ *
+ * Backtracks an output-length in count of gunichar2 to the
+ * corresponding length, in bytes, of the source string.
+ */
+static size_t
+invalidly_encoded_string_to_utf16_backtrack (const char *start,
+                                             size_t      output_length)
+{
+  /* The conversion process outputs one gunichar2 (a complete
+   * character) for each input byte, so the mapping is the
+   * identity: N gunichar2 of output correspond to N source bytes.
+   */
+  return output_length;
+}
+
+
+/**< private >
+ * valid_utf8_to_utf16_get_output_length:
+ *
+ * @start: first byte of the source range. Must be valid UTF-8.
+ * @end: one past the last byte of the source range.
+ *
+ * Returns the number of gunichar2 needed to hold the UTF-16
+ * translation of the valid UTF-8 range [@start, @end).
+ */
+static size_t
+valid_utf8_to_utf16_get_output_length (const char *start,
+                                       const char *end)
+{
+  size_t n_units = 0;
+
+  for (; start < end; start = g_utf8_next_char (start))
+    {
+      const gunichar ch = g_utf8_get_char (start);
+
+      /* Code points beyond U+FFFF are encoded as a surrogate
+       * pair in UTF-16, everything else as a single unit. */
+      n_units += (ch > 0xFFFF) ? 2 : 1;
+    }
+
+  /* Valid UTF-8 in [start, end) is consumed exactly; not
+   * landing on @end means the precondition was violated.
+   */
+  g_assert (start == end);
+
+  return n_units;
+}
+
+/**< private >
+ * valid_utf8_to_utf16:
+ *
+ * @start: first byte of the source range. Must be valid UTF-8.
+ * @end: one past the last byte of the source range.
+ * @output: the output buffer. Must be long enough to hold
+ *          the entire output.
+ *
+ * Converts the valid UTF-8 range to UTF-16; returns the gunichar2 count.
+ */
+static size_t
+valid_utf8_to_utf16 (const char *start,
+                     const char *end,
+                     gunichar2  *output)
+{
+  size_t n_units = 0;
+
+  for (; start < end; start = g_utf8_next_char (start))
+    {
+      const gunichar ch = g_utf8_get_char (start);
+
+      if (ch > 0xFFFF)
+        {
+          /* Beyond the BMP: emit a high/low surrogate pair */
+          const gunichar offset = ch - 0x010000;
+          output[n_units] = 0xD800 + ((offset >> 10) & 0x3FF);
+          output[n_units + 1] = 0xDC00 + (offset & 0x3FF);
+          n_units += 2;
+        }
+      else
+        {
+          output[n_units++] = (gunichar2) ch;
+        }
+    }
+
+  g_assert (start == end);
+
+  return n_units;
+}
+
+/**< private >
+ * valid_utf8_to_utf16_backtrack:
+ *
+ * @start: start of the source string. Must be valid UTF-8.
+ * @output_length: length within the output UTF-16 string expressed
+ *                 as a count of gunichar2.
+ *
+ * Maps a length in the UTF-16 output (in gunichar2) back to the
+ * length, in bytes, of the source prefix that produces it.
+ */
+static size_t
+valid_utf8_to_utf16_backtrack (const char *start,
+                               size_t      output_length)
+{
+  const char *cursor = start;
+  size_t n_units = 0;
+
+  /* Walk whole characters until the requested number of
+   * gunichar2 is covered or the string ends. A surrogate
+   * pair is never split: its character counts in full. */
+  while (*cursor != '\0' && n_units < output_length)
+    {
+      const gunichar ch = g_utf8_get_char (cursor);
+
+      n_units += (ch > 0xFFFF) ? 2 : 1;
+      cursor = g_utf8_next_char (cursor);
+    }
+
+  return (uintptr_t)cursor - (uintptr_t)start;
+}
+
+
+/* Computes the length, in gunichar2, of the UTF-16 output for @string */
+static size_t
+utf8_to_utf16_make_valid_get_output_length (const char *string)
+{
+  const char *start = string;
+  const char *end;
+  size_t total = 0;
+
+  /* The input alternates between valid runs and invalid runs;
+   * measure each kind with its own helper until the NUL. */
+  for (;;)
+    {
+      end = find_invalid_or_incomplete_utf8_sequence (start);
+      total += valid_utf8_to_utf16_get_output_length (start, end);
+      start = end;
+      if (*start == '\0')
+        break;
+
+      end = find_valid_and_complete_utf8_sequence (start);
+      g_assert ((uintptr_t)end > (uintptr_t)start);
+      total += invalidly_encoded_string_to_utf16_get_output_length (start, end);
+      start = end;
+      if (*start == '\0')
+        break;
+    }
+
+  return total;
+}
+
+/* Maps a length in the make-valid UTF-16 output back to source bytes */
+static size_t
+utf8_to_utf16_make_valid_backtrack (const char *string,
+                                    size_t      output_length)
+{
+  const char *start = string;
+  size_t count = 0; /* source bytes of the runs fully skipped so far */
+  size_t l;         /* gunichar2 produced by the current run */
+
+  while (true)
+    {
+      const char *end = find_invalid_or_incomplete_utf8_sequence (start);
+
+      l = valid_utf8_to_utf16_get_output_length (start, end);
+      /* @output_length is the remainder of the request (in gunichar2) and
+       * @count is in bytes: compare against @l alone, as for invalid runs. */
+      if (output_length < l)
+        return count + valid_utf8_to_utf16_backtrack (start, output_length);
+      count += (uintptr_t)end - (uintptr_t)start;
+      output_length -= l;
+      start = end;
+      if (start[0] == '\0')
+        return (uintptr_t)start - (uintptr_t)string;
+
+      end = find_valid_and_complete_utf8_sequence (start);
+      g_assert ((uintptr_t)end > (uintptr_t)start);
+      l = invalidly_encoded_string_to_utf16_get_output_length (start, end);
+      if (output_length < l)
+        return count + invalidly_encoded_string_to_utf16_backtrack (start, output_length);
+      count += (uintptr_t)end - (uintptr_t)start;
+      output_length -= l;
+      start = end;
+      if (start[0] == '\0')
+        return (uintptr_t)start - (uintptr_t)string;
+    }
+
+  return count;
+}
+
+
+/* Writes the make-valid UTF-16 form of @string to @output (unterminated) */
+static size_t
+utf8_to_utf16_make_valid (const char *string,
+                          gunichar2  *output)
+{
+  const char *start = string;
+  const char *end;
+  size_t written = 0;
+
+  for (;;)
+    {
+      /* Valid run: real conversion */
+      end = find_invalid_or_incomplete_utf8_sequence (start);
+      written += valid_utf8_to_utf16 (start, end, output + written);
+      start = end;
+      if (*start == '\0')
+        break;
+
+      /* Invalid run: generic replacement */
+      end = find_valid_and_complete_utf8_sequence (start);
+      g_assert ((uintptr_t)end > (uintptr_t)start);
+      written += invalidly_encoded_string_to_utf16 (start, end, output + written);
+      start = end;
+      if (*start == '\0')
+        break;
+    }
+
+  return written;
+}
+
+/** < private >
+ * g_utf8_to_utf16_make_valid:
+ *
+ * @utf8: source UTF-8 string. May contain invalid or incomplete sequences.
+ * @buffer: optional auxiliary buffer where the output UTF-16 string will be
+ *          stored if large enough to hold the output. Callers can pass NULL,
+ *          in which case the output buffer is allocated on the heap.
+ * @buffer_len: length, in count of gunichar2, of @buffer. This is used only
+ *              if @buffer is not NULL.
+ * @out_utf16: pointer that will be set to the output string. If @buffer is
+ *             long enough to hold the data, *out_utf16 will equal @buffer
+ *             upon return; otherwise *out_utf16 will point to heap-allocated
+ *             data, which must be freed using `g_free`.
+ * @out_utf16_len: pointer to size_t that will be set to the length of the
+ *                 output UTF-16 string on return, in count of gunichar2.
+ *                 Can be NULL.
+ *
+ * Converts a UTF-8 string that may contain invalid sequences to a
+ * NUL-terminated UTF-16 string (the terminator is not counted in the length).
+ *
+ * On return, the caller should check if *out_utf16 differs from @buffer and
+ * call `g_free` on it in that case.
+ */
+void
+g_utf8_to_utf16_make_valid (const char  *utf8,
+                            gunichar2   *buffer,
+                            size_t       buffer_len,
+                            gunichar2  **out_utf16,
+                            size_t      *out_utf16_len)
+{
+  size_t output_length = utf8_to_utf16_make_valid_get_output_length (utf8);
+
+  /* A NULL @buffer is never used, whatever @buffer_len says */
+  if (buffer != NULL && output_length < buffer_len)
+    {
+      *out_utf16 = buffer;
+    }
+  else
+    {
+      /* output_length cannot be greater than strlen (utf8), which is
+       * less than SIZE_MAX since utf8 is NUL-terminated. As such,
+       * (output_length + 1) cannot overflow. */
+      *out_utf16 = g_new (gunichar2, output_length + 1);
+    }
+
+  utf8_to_utf16_make_valid (utf8, *out_utf16);
+
+  /* Add the terminating NUL character */
+  (*out_utf16)[output_length] = L'\0';
+
+  if (out_utf16_len)
+    *out_utf16_len = output_length;
+}
+
+/** < private >
+ * g_utf8_to_utf16_make_valid_backtrack:
+ *
+ * @utf8: source UTF-8 string. May contain invalid or incomplete sequences.
+ * @utf16_len: length within the output UTF-16 string expressed as a count
+ *             of gunichar2.
+ *
+ * Backtracks an output length, in count of gunichar2, to the corresponding
+ * length, in bytes, of @utf8. Forwards to the static implementation.
+ */
+size_t
+g_utf8_to_utf16_make_valid_backtrack (const char  *utf8,
+                                      size_t       utf16_len)
+{
+  return utf8_to_utf16_make_valid_backtrack (utf8, utf16_len);
+}
+
 /* SIMD-based UTF-8 validation originates in the c-utf8 project from
 * https://github.com/c-util/c-utf8/ from the following authors:
 *