libical/0004-vobject.c-vCard-Unicode-reading-support.patch

From aacb3875a9a645880cbfe014fb0c4cb078ff4342 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@inai.de>
Date: Mon, 17 Sep 2018 21:27:43 +0200
Subject: [PATCH 4/5] vobject.c - vCard Unicode reading support
References: https://github.com/libical/libical/pull/354

RFC 6350 declares vCard to be UTF-8 throughout without exceptions.

However, any non-ASCII vCard content is garbled because fakeUnicode()
botches the conversion to wchar_t: it just copies each byte value from
char to wchar_t, which is correct neither for UTF-8 nor for
(hypothetical) ISO-8859-1/-15 coded input.

This patch fixes that.

References: #353
---
 src/libicalvcal/vobject.c | 107 +++++++++++++++++++++++++++++++++------
 1 file changed, 91 insertions(+), 16 deletions(-)

diff --git a/src/libicalvcal/vobject.c b/src/libicalvcal/vobject.c
index 10d0cf5a..b880716f 100644
--- a/src/libicalvcal/vobject.c
+++ b/src/libicalvcal/vobject.c
@@ -45,6 +45,9 @@ DFARS 252.227-7013 or 48 CFR 52.227-19, as applicable.
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
+#include <errno.h>
+#include <iconv.h>
+#include <stdint.h>

 #include "vobject.h"

@@ -1414,27 +1417,99 @@ char* writeMemVObjects(char *s, int *len, VObject *list)
 /*----------------------------------------------------------------------
   APIs to do fake Unicode stuff.
   ----------------------------------------------------------------------*/
+/*
+ * Convert the NUL-terminated UTF-8 string @ps to a malloc'ed wide string.
+ *
+ * Returns NULL if the converter cannot be opened, the input is too large,
+ * or memory runs out. If @bytes is non-NULL, it receives the size of the
+ * result in bytes, not counting the terminating L'\0'. Input bytes that
+ * iconv rejects (EILSEQ/EINVAL) are skipped one at a time.
+ *
+ * The only place where this spells Unicode is 1.) in "UTF-8", 2.) when it does
+ * the secondary pass to replace \n and \r with U+2028 and 2029, respectively.
+ * That step blindly pretends wchar_t shares the Unicode codepoints (happens to
+ * work for the important contemporary platforms, but otherwise is nonsense).
+ */
 wchar_t* fakeUnicode(const char *ps, size_t *bytes)
 {
-    wchar_t *r, *pw;
-    size_t len = strlen(ps)+1;
+	/*
+	 * Assuming the input were all ASCII, then
+	 *
+	 * 	method1_out_size = zs * sizeof(wchar_t)
+	 *
+	 * would make sense. But if the input were all 3-byte UTF-8 codepoints,
+	 * then that would be a large wasteful allocation, and
+	 *
+	 * 	method2_out_size = zs * sizeof(wchar_t) / 3
+	 *
+	 * would be more reasonable. Since there is no way of knowing in
+	 * advance what is in @ps, method 1 will be chosen if that is a 1KB
+	 * allocation (or less), and method 2 otherwise. From there, the
+	 * standard exponential progression for realloc is applied.
+	 */
+	size_t zs = strlen(ps), out_size, out_rem;
+	char *out_block, *out_iter;
+	iconv_t conv = iconv_open("wchar_t", "utf-8");

-    pw = r = (wchar_t*)malloc(sizeof(wchar_t)*len);
-    if (bytes)
-        *bytes = len * sizeof(wchar_t);
+	if (conv == (iconv_t)-1)
+		return NULL;
+	if (zs >= (SIZE_MAX - sizeof(wchar_t)) / sizeof(wchar_t)) {
+		/* Input is larger than anything we want to handle */
+		iconv_close(conv);
+		return NULL;
+	}
+	/* Initial allocation size as per above. */
+	out_size = zs * sizeof(wchar_t);
+	if (out_size >= 1024 - sizeof(wchar_t))
+		out_size /= 3;
+	/* Free space must match what is really allocated, or iconv overruns. */
+	out_rem = out_size;
+	out_iter = out_block = malloc(out_size + sizeof(wchar_t));
+	if (out_block == NULL) {
+		iconv_close(conv);
+		return NULL;
+	}

-    while (*ps) {
-        if (*ps == '\n')
-            *pw = (wchar_t)0x2028;
-        else if (*ps == '\r')
-            *pw = (wchar_t)0x2029;
-        else
-            *pw = (wchar_t)(unsigned char)*ps;
-        ps++; pw++;
-        }
-    *pw = (wchar_t)0;
+	while (zs > 0) {
+		size_t ret;
+		errno = 0;
+		ret = iconv(conv, (char **)&ps, &zs, &out_iter, &out_rem);
+		if (ret != (size_t)-1)
+			continue;
+		if (errno == EILSEQ || errno == EINVAL) {
+			/* Skip the undecodable byte and carry on. */
+			++ps;
+			--zs;
+			continue;
+		}
+		if (errno != E2BIG)
+			break;
+		/* Buffer full: save the write offset, then double the buffer. */
+		size_t used = (size_t)(out_iter - out_block);
+		out_rem  += out_size;
+		out_size *= 2;
+		char *new_block = realloc(out_block, out_size + sizeof(wchar_t));
+		if (new_block == NULL) {
+			free(out_block);
+			iconv_close(conv);
+			return NULL;
+		}
+		out_block = new_block;
+		out_iter  = new_block + used;
+	}
+	iconv_close(conv);

-    return r;
+	wchar_t *wide = (wchar_t *)out_block, *p = wide;
+	for (; p < (wchar_t *)(out_block + out_size - out_rem); ++p) {
+		if (*p == '\n')
+			*p = 0x2028;
+		else if (*p == '\r')
+			*p = 0x2029;
+	}
+	*p = L'\0';
+	if (bytes != NULL)
+		*bytes = (char *)p - out_block;
+	return wide;
 }

 int uStrLen(const wchar_t *u)
--
2.19.1