gconvert: Tighten, document embedded NUL behavior of UTF-8 conversions

The character encoding conversion utility functions g_locale_to_utf8() and g_filename_to_utf8() had inconsistent behavior on producing strings with inner NUL bytes: in the all-UTF-8 strdup path, the input string validation prohibits embedded NULs, while g_convert(), using iconv(), can produce UTF-8 output with NUL bytes inside the output buffer. This, while valid UTF-8 per the Unicode standard, is not valid for the nul-terminated (type utf8) return value format that the *_to_utf8() functions are annotated with (as per discussion in bug 756128). Check the output of g_convert() for embedded NUL bytes, and if any are found, set the newly introduced error G_CONVERT_ERROR_EMBEDDED_NUL. Also document the error set by g_{locale,filename}_{from,to}_utf8() when the input string contains nul bytes. https://bugzilla.gnome.org/show_bug.cgi?id=792516
2025-08-22 17:08:53 +02:00 · 2018-01-14 16:55:03 +02:00
parent 413605a6f3
commit 81cd815406
2 changed files with 76 additions and 14 deletions
--- a/glib/gconvert.c
+++ b/glib/gconvert.c
@@ -866,6 +866,40 @@ strdup_len (const gchar *string,
  return g_strndup (string, real_len);
 }

+static gchar *
+convert_to_utf8 (const gchar *opsysstring,
+                 gssize       len,
+                 const gchar *charset,
+                 gsize       *bytes_read,
+                 gsize       *bytes_written,
+                 GError     **error)
+{
+  gchar *utf8;
+  gsize outbytes;
+
+  utf8 = g_convert (opsysstring, len, "UTF-8", charset,
+                    bytes_read, &outbytes, error);
+  if (utf8 == NULL)
+    {
+      if (bytes_written)
+        *bytes_written = 0;
+      return NULL;
+    }
+  if (memchr (utf8, '\0', outbytes) != NULL)
+    {
+      g_free (utf8);
+      if (bytes_written)
+        *bytes_written = 0;
+      g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_EMBEDDED_NUL,
+                           _("Embedded NUL byte in conversion output"));
+      return NULL;
+    }
+
+  if (bytes_written)
+    *bytes_written = outbytes;
+  return utf8;
+}
+
 /**
 * g_locale_to_utf8:
 * @opsysstring:   a string in the encoding of the current locale. On Windows
@@ -879,7 +913,7 @@ strdup_len (const gchar *string,
 *                 Even if the conversion was successful, this may be 
 *                 less than @len if there were partial characters
 *                 at the end of the input. If the error
- *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
+ *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 *                 stored will the byte offset after the last valid
 *                 input sequence.
 * @bytes_written: (out) (optional): the number of bytes stored in the output
@@ -890,6 +924,14 @@ strdup_len (const gchar *string,
 * Converts a string which is in the encoding used for strings by
 * the C runtime (usually the same as that used by the operating
 * system) in the [current locale][setlocale] into a UTF-8 string.
+ *
+ * If the source encoding is not UTF-8 and the conversion output contains a
+ * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the
+ * function returns %NULL.
+ * If the source encoding is UTF-8, an embedded nul character is treated with
+ * the %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error for backward compatibility with
+ * earlier versions of this library. Use g_convert() to produce output that
+ * may contain embedded nul characters.
 * 
 * Returns: A newly-allocated buffer containing the converted string,
 *               or %NULL on an error, and error will be set.
@@ -906,23 +948,21 @@ g_locale_to_utf8 (const gchar  *opsysstring,
  if (g_get_charset (&charset))
    return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
  else
-    return g_convert (opsysstring, len, 
-		      "UTF-8", charset, bytes_read, bytes_written, error);
+    return convert_to_utf8 (opsysstring, len, charset,
+                            bytes_read, bytes_written, error);
 }

 /**
 * g_locale_from_utf8:
 * @utf8string:    a UTF-8 encoded string 
 * @len:           the length of the string, or -1 if the string is
- *                 nul-terminated (Note that some encodings may allow nul
- *                 bytes to occur inside strings. In that case, using -1
- *                 for the @len parameter is unsafe)
+ *                 nul-terminated.
 * @bytes_read: (out) (optional): location to store the number of bytes in the
 *                 input string that were successfully converted, or %NULL.
 *                 Even if the conversion was successful, this may be 
 *                 less than @len if there were partial characters
 *                 at the end of the input. If the error
- *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
+ *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 *                 stored will the byte offset after the last valid
 *                 input sequence.
 * @bytes_written: (out) (optional): the number of bytes stored in the output
@@ -934,7 +974,12 @@ g_locale_to_utf8 (const gchar  *opsysstring,
 * the C runtime (usually the same as that used by the operating
 * system) in the [current locale][setlocale]. On Windows this means
 * the system codepage.
- * 
+ *
+ * The input string should not contain nul characters even if the @len
+ * argument is positive. A nul character found inside the string may result
+ * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. Use g_convert() to convert
+ * input that may contain embedded nul characters.
+ *
 * Returns: A newly-allocated buffer containing the converted string,
 *               or %NULL on an error, and error will be set.
 **/
@@ -1126,7 +1171,7 @@ get_filename_charset (const gchar **filename_charset)
 *                 Even if the conversion was successful, this may be 
 *                 less than @len if there were partial characters
 *                 at the end of the input. If the error
- *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
+ *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 *                 stored will the byte offset after the last valid
 *                 input sequence.
 * @bytes_written: (out) (optional): the number of bytes stored in the output
@@ -1138,6 +1183,14 @@ get_filename_charset (const gchar **filename_charset)
 * filenames into a UTF-8 string. Note that on Windows GLib uses UTF-8
 * for filenames; on other platforms, this function indirectly depends on 
 * the [current locale][setlocale].
+ *
+ * If the source encoding is not UTF-8 and the conversion output contains a
+ * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the
+ * function returns %NULL.
+ * If the source encoding is UTF-8, an embedded nul character is treated with
+ * the %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error for backward compatibility with
+ * earlier versions of this library. Use g_convert() to produce output that
+ * may contain embedded nul characters.
 * 
 * Returns: The converted string, or %NULL on an error.
 **/
@@ -1155,8 +1208,8 @@ g_filename_to_utf8 (const gchar *opsysstring,
  if (get_filename_charset (&charset))
    return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
  else
-    return g_convert (opsysstring, len, 
-		      "UTF-8", charset, bytes_read, bytes_written, error);
+    return convert_to_utf8 (opsysstring, len, charset,
+                            bytes_read, bytes_written, error);
 }

 /**
@@ -1169,7 +1222,7 @@ g_filename_to_utf8 (const gchar *opsysstring,
 *                 Even if the conversion was successful, this may be 
 *                 less than @len if there were partial characters
 *                 at the end of the input. If the error
- *                 #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
+ *                 %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
 *                 stored will the byte offset after the last valid
 *                 input sequence.
 * @bytes_written: (out): the number of bytes stored in the output buffer (not 
@@ -1181,7 +1234,12 @@ g_filename_to_utf8 (const gchar *opsysstring,
 * filenames. Note that on Windows GLib uses UTF-8 for filenames;
 * on other platforms, this function indirectly depends on the 
 * [current locale][setlocale].
- * 
+ *
+ * The input string should not contain nul characters even if the @len
+ * argument is positive. A nul character found inside the string may result
+ * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. Note that nul bytes are
+ * prohibited in all filename encodings that GLib is known to work with.
+ *
 * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
 *               The converted string, or %NULL on an error.
 **/
--- a/glib/gconvert.h
+++ b/glib/gconvert.h
@@ -43,6 +43,9 @@ G_BEGIN_DECLS
 * @G_CONVERT_ERROR_BAD_URI: URI is invalid.
 * @G_CONVERT_ERROR_NOT_ABSOLUTE_PATH: Pathname is not an absolute path.
 * @G_CONVERT_ERROR_NO_MEMORY: No memory available. Since: 2.40
+ * @G_CONVERT_ERROR_EMBEDDED_NUL: An embedded NUL character is present in
+ *     conversion output where a NUL-terminated string is expected.
+ *     Since: 2.56
 *
 * Error codes returned by character set conversion routines.
 */
@@ -54,7 +57,8 @@ typedef enum
  G_CONVERT_ERROR_PARTIAL_INPUT,
  G_CONVERT_ERROR_BAD_URI,
  G_CONVERT_ERROR_NOT_ABSOLUTE_PATH,
-  G_CONVERT_ERROR_NO_MEMORY
+  G_CONVERT_ERROR_NO_MEMORY,
+  G_CONVERT_ERROR_EMBEDDED_NUL
 } GConvertError;

 /**