141 lines
4.2 KiB
Diff
141 lines
4.2 KiB
Diff
From aacb3875a9a645880cbfe014fb0c4cb078ff4342 Mon Sep 17 00:00:00 2001
|
|
From: Jan Engelhardt <jengelh@inai.de>
|
|
Date: Mon, 17 Sep 2018 21:27:43 +0200
|
|
Subject: [PATCH 4/5] vobject.c - vCard Unicode reading support
|
|
References: https://github.com/libical/libical/pull/354
|
|
|
|
RFC 6350 declares vCard to be UTF-8 throughout without exceptions.
|
|
|
|
However, any non-ASCII vCard content is garbled because fakeUnicode()
botches the conversion to wchar_t: it just copies each value from char
to wchar_t, which is correct neither for UTF-8 nor for (a hypothetical)
ISO-8859-1/-15 coded input.
|
|
|
|
This patch fixes that.
|
|
|
|
References: #353
|
|
---
|
|
src/libicalvcal/vobject.c | 94 ++++++++++++++++++++++++++++++++-------
|
|
1 file changed, 78 insertions(+), 16 deletions(-)
|
|
|
|
diff --git a/src/libicalvcal/vobject.c b/src/libicalvcal/vobject.c
|
|
index 10d0cf5a..b880716f 100644
|
|
--- a/src/libicalvcal/vobject.c
|
|
+++ b/src/libicalvcal/vobject.c
|
|
@@ -45,6 +45,9 @@ DFARS 252.227-7013 or 48 CFR 52.227-19, as applicable.
|
|
#ifdef HAVE_CONFIG_H
|
|
#include <config.h>
|
|
#endif
|
|
+#include <errno.h>
|
|
+#include <iconv.h>
|
|
+#include <stdint.h>
|
|
|
|
#include "vobject.h"
|
|
|
|
@@ -1414,27 +1417,86 @@ char* writeMemVObjects(char *s, int *len, VObject *list)
|
|
/*----------------------------------------------------------------------
|
|
APIs to do fake Unicode stuff.
|
|
----------------------------------------------------------------------*/
|
|
+/*
|
|
+ * Convert UTF-8 to wide chars.
|
|
+ *
|
|
+ * The only place where this spells Unicode is 1.) in "UTF-8", 2.) when it does
|
|
+ * the secondary pass to replace \n and \r with U+2028 and 2029, respectively.
|
|
+ * That step blindly pretends wchar_t shares the Unicode codepoints (happens to
|
|
+ * work for the important contemporary platforms, but otherwise is nonsense).
|
|
+ */
|
|
wchar_t* fakeUnicode(const char *ps, size_t *bytes)
|
|
{
|
|
- wchar_t *r, *pw;
|
|
- size_t len = strlen(ps)+1;
|
|
+ /*
|
|
+ * Assuming the input were all ASCII, then
|
|
+ *
|
|
+ * method1_out_size = zs * sizeof(wchar_t)
|
|
+ *
|
|
+ * would make sense. But if the input were all 3-byte UTF-8 codepoints,
|
|
+ * then that would be a large wasteful allocation, and
|
|
+ *
|
|
+ * method2_out_size = zs * sizeof(wchar_t) / 3
|
|
+ *
|
|
+ * would be more reasonable. Since there is no way of knowing in
|
|
+ * advance what is in @ps, method 1 will be chosen if that is a 1KB
|
|
+ * allocation (or less), and method 2 otherwise. From there, the
|
|
+ * standard exponential progression for realloc is applied.
|
|
+ */
|
|
+ size_t zs = strlen(ps), out_size, out_rem;
|
|
+ char *out_block, *out_iter;
|
|
+ iconv_t conv = iconv_open("wchar_t", "utf-8");
|
|
|
|
- pw = r = (wchar_t*)malloc(sizeof(wchar_t)*len);
|
|
- if (bytes)
|
|
- *bytes = len * sizeof(wchar_t);
|
|
+ if (conv == (iconv_t)-1)
|
|
+ return NULL;
|
|
+ if (zs >= (SIZE_MAX - sizeof(wchar_t)) / sizeof(wchar_t))
|
|
+ /* Input is larger than anything we want to handle */
|
|
+ return NULL;
|
|
+ /* Initial allocation size as per above. */
|
|
+ out_size = out_rem = zs * sizeof(wchar_t);
|
|
+ if (out_size >= 1024 - sizeof(wchar_t))
|
|
+ out_size /= 3;
|
|
+ out_iter = out_block = malloc(out_size + sizeof(wchar_t));
|
|
+ if (out_block == NULL) {
|
|
+ iconv_close(conv);
|
|
+ return NULL;
|
|
+ }
|
|
|
|
- while (*ps) {
|
|
- if (*ps == '\n')
|
|
- *pw = (wchar_t)0x2028;
|
|
- else if (*ps == '\r')
|
|
- *pw = (wchar_t)0x2029;
|
|
- else
|
|
- *pw = (wchar_t)(unsigned char)*ps;
|
|
- ps++; pw++;
|
|
- }
|
|
- *pw = (wchar_t)0;
|
|
+ while (zs > 0) {
|
|
+ int ret;
|
|
+ errno = 0;
|
|
+ ret = iconv(conv, (char **)&ps, &zs, &out_iter, &out_rem);
|
|
+ if (ret >= 0)
|
|
+ continue;
|
|
+ if (errno == EILSEQ || errno == EINVAL) {
|
|
+ ++ps;
|
|
+ --zs;
|
|
+ continue;
|
|
+ }
|
|
+ if (errno != E2BIG)
|
|
+ break;
|
|
+ out_rem += out_size;
|
|
+ out_size *= 2;
|
|
+ char *new_block = realloc(out_block, out_size + sizeof(wchar_t));
|
|
+ if (new_block == NULL) {
|
|
+ free(out_block);
|
|
+ iconv_close(conv);
|
|
+ return NULL;
|
|
+ }
|
|
+ out_iter = new_block + (out_iter - out_block);
|
|
+ out_block = new_block;
|
|
+ }
|
|
|
|
- return r;
|
|
+ wchar_t *wide = (wchar_t *)out_block, *p = wide;
|
|
+ for (; p < (wchar_t *)(out_block + out_size - out_rem); ++p) {
|
|
+ if (*p == '\n')
|
|
+ *p = 0x2028;
|
|
+ else if (*p == '\r')
|
|
+ *p = 0x2029;
|
|
+ }
|
|
+ *p = L'\0';
|
|
+ if (bytes != NULL)
|
|
+ *bytes = (char *)p - out_block;
|
|
+ return wide;
|
|
}
|
|
|
|
int uStrLen(const wchar_t *u)
|
|
--
|
|
2.19.1
|
|
|