json-c/json-c-0.9-json_tokener.patch

diff -Naur json-c-0.9.old//json_tokener.c json-c-0.9//json_tokener.c
--- json-c-0.9.old//json_tokener.c	2009-12-15 23:53:18.000000000 +0000
+++ json-c-0.9//json_tokener.c	2009-12-16 00:52:40.000000000 +0000
@@ -58,6 +58,11 @@
   "expected comment",
 };
 
+/* Stuff for decoding unicode sequences */
+#define IS_HIGH_SURROGATE(uc) (((uc) & 0xFC00) == 0xD800)
+#define IS_LOW_SURROGATE(uc)  (((uc) & 0xFC00) == 0xDC00)
+#define DECODE_SURROGATE_PAIR(hi,lo) ((((hi) & 0x3FF) << 10) + ((lo) & 0x3FF) + 0x10000)
+static unsigned char utf8_replacement_char[3] = { 0xEF, 0xBF, 0xBD };
 
 struct json_tokener* json_tokener_new(void)
 {
@@ -398,40 +403,94 @@
       break;
 
     case json_tokener_state_escape_unicode:
-            /* Note that the following code is inefficient for handling large
-       * chunks of extended chars, calling printbuf_memappend() once
-       * for each multi-byte character of input.
-       * This is a good area for future optimization.
-       */
 	{
-	  /* Advance until we change state */
+	  unsigned int got_hi_surrogate = 0;
+	  /* Handle a 4-byte sequence, or two sequences if a surrogate pair */
 	  while(1) {
 	    if(strchr(json_hex_chars, c)) {
 	      tok->ucs_char += ((unsigned int)hexdigit(c) << ((3-tok->st_pos++)*4));
 	      if(tok->st_pos == 4) {
-		unsigned char utf_out[3];
+                if (got_hi_surrogate) {
+		  if (IS_LOW_SURROGATE(tok->ucs_char)) {
+                    /* Recalculate the ucs_char, then fall thru to process normally */
+                    tok->ucs_char = DECODE_SURROGATE_PAIR(got_hi_surrogate, tok->ucs_char);
+                  } else {
+                    /* Hi surrogate was not followed by a low surrogate */
+                    /* Replace the hi and process the rest normally */
+		    printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);
+                  }
+                  got_hi_surrogate = 0;
+                }
+		unsigned char unescaped_utf[4];
 		if (tok->ucs_char < 0x80) {
-		  utf_out[0] = tok->ucs_char;
-		  printbuf_memappend_fast(tok->pb, (char*)utf_out, 1);
+		  unescaped_utf[0] = tok->ucs_char;
+		  printbuf_memappend_fast(tok->pb, (char*)unescaped_utf, 1);
 		} else if (tok->ucs_char < 0x800) {
-		  utf_out[0] = 0xc0 | (tok->ucs_char >> 6);
-		  utf_out[1] = 0x80 | (tok->ucs_char & 0x3f);
-		  printbuf_memappend_fast(tok->pb, (char*)utf_out, 2);
+		  unescaped_utf[0] = 0xc0 | (tok->ucs_char >> 6);
+		  unescaped_utf[1] = 0x80 | (tok->ucs_char & 0x3f);
+		  printbuf_memappend_fast(tok->pb, (char*)unescaped_utf, 2);
+		} else if (IS_HIGH_SURROGATE(tok->ucs_char)) {
+                  /* Got a high surrogate.  Remember it and look for the
+                   * the beginning of another sequence, which should be the
+                   * low surrogate.
+                   */
+                  got_hi_surrogate = tok->ucs_char;
+                  /* Not at end, and the next two chars should be "\u" */
+                  if ((tok->char_offset+1 != len) &&
+                      (tok->char_offset+2 != len) &&
+                      (str[1] == '\\') &&
+                      (str[2] == 'u'))
+                  {
+	            ADVANCE_CHAR(str, tok);
+	            ADVANCE_CHAR(str, tok);
+
+                    /* Advance to the first char of the next sequence and
+                     * continue processing with the next sequence.
+                     */
+	            if (!ADVANCE_CHAR(str, tok) || !POP_CHAR(c, tok)) {
+	              printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);
+	              goto out;
+                    }
+	            tok->ucs_char = 0;
+                    tok->st_pos = 0;
+                    continue; /* other json_tokener_state_escape_unicode */
+                  } else {
+                    /* Got a high surrogate without another sequence following
+                     * it.  Put a replacement char in for the hi surrogate
+                     * and pretend we finished.
+                     */
+		    printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);
+                  }
+		} else if (IS_LOW_SURROGATE(tok->ucs_char)) {
+                  /* Got a low surrogate not preceded by a high */
+		  printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);
+                } else if (tok->ucs_char < 0x10000) {
+		  unescaped_utf[0] = 0xe0 | (tok->ucs_char >> 12);
+		  unescaped_utf[1] = 0x80 | ((tok->ucs_char >> 6) & 0x3f);
+		  unescaped_utf[2] = 0x80 | (tok->ucs_char & 0x3f);
+		  printbuf_memappend_fast(tok->pb, (char*)unescaped_utf, 3);
+		} else if (tok->ucs_char < 0x110000) {
+		  unescaped_utf[0] = 0xf0 | ((tok->ucs_char >> 18) & 0x07);
+		  unescaped_utf[1] = 0x80 | ((tok->ucs_char >> 12) & 0x3f);
+		  unescaped_utf[2] = 0x80 | ((tok->ucs_char >> 6) & 0x3f);
+		  unescaped_utf[3] = 0x80 | (tok->ucs_char & 0x3f);
+		  printbuf_memappend_fast(tok->pb, (char*)unescaped_utf, 4);
 		} else {
-		  utf_out[0] = 0xe0 | (tok->ucs_char >> 12);
-		  utf_out[1] = 0x80 | ((tok->ucs_char >> 6) & 0x3f);
-		  utf_out[2] = 0x80 | (tok->ucs_char & 0x3f);
-		  printbuf_memappend_fast(tok->pb, (char*)utf_out, 3);
-		}
+                  /* Don't know what we got--insert the replacement char */
+		  printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);
+                }
 		state = saved_state;
 		break;
 	      }
 	    } else {
 	      tok->err = json_tokener_error_parse_string;
 	      goto out;
-	      	  }
-	  if (!ADVANCE_CHAR(str, tok) || !POP_CHAR(c, tok))
+	    }
+	  if (!ADVANCE_CHAR(str, tok) || !POP_CHAR(c, tok)) {
+            if (got_hi_surrogate) /* Clean up any pending chars */
+	      printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);
 	    goto out;
+	  }
 	}
       }
       break;
Accepting request 58770 from home:computersalat:devel:tel OBS-URL: https://build.opensuse.org/request/show/58770 OBS-URL: https://build.opensuse.org/package/show/devel:libraries:c_c++/json-c?expand=0&rev=1 2011-02-09 17:23:18 +00:00			`diff -Naur json-c-0.9.old//json_tokener.c json-c-0.9//json_tokener.c`
			`--- json-c-0.9.old//json_tokener.c 2009-12-15 23:53:18.000000000 +0000`
			`+++ json-c-0.9//json_tokener.c 2009-12-16 00:52:40.000000000 +0000`
			`@@ -58,6 +58,11 @@`
			`"expected comment",`
			`};`

			`+/* Stuff for decoding unicode sequences */`
			`+#define IS_HIGH_SURROGATE(uc) (((uc) & 0xFC00) == 0xD800)`
			`+#define IS_LOW_SURROGATE(uc) (((uc) & 0xFC00) == 0xDC00)`
			`+#define DECODE_SURROGATE_PAIR(hi,lo) ((((hi) & 0x3FF) << 10) + ((lo) & 0x3FF) + 0x10000)`
			`+static unsigned char utf8_replacement_char[3] = { 0xEF, 0xBF, 0xBD };`

			`struct json_tokener* json_tokener_new(void)`
			`{`
			`@@ -398,40 +403,94 @@`
			`break;`

			`case json_tokener_state_escape_unicode:`
			`- /* Note that the following code is inefficient for handling large`
			`- * chunks of extended chars, calling printbuf_memappend() once`
			`- * for each multi-byte character of input.`
			`- * This is a good area for future optimization.`
			`- */`
			`{`
			`- /* Advance until we change state */`
			`+ unsigned int got_hi_surrogate = 0;`
			`+ /* Handle a 4-byte sequence, or two sequences if a surrogate pair */`
			`while(1) {`
			`if(strchr(json_hex_chars, c)) {`
			`tok->ucs_char += ((unsigned int)hexdigit(c) << ((3-tok->st_pos++)*4));`
			`if(tok->st_pos == 4) {`
			`- unsigned char utf_out[3];`
			`+ if (got_hi_surrogate) {`
			`+ if (IS_LOW_SURROGATE(tok->ucs_char)) {`
			`+ /* Recalculate the ucs_char, then fall thru to process normally */`
			`+ tok->ucs_char = DECODE_SURROGATE_PAIR(got_hi_surrogate, tok->ucs_char);`
			`+ } else {`
			`+ /* Hi surrogate was not followed by a low surrogate */`
			`+ /* Replace the hi and process the rest normally */`
			`+ printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);`
			`+ }`
			`+ got_hi_surrogate = 0;`
			`+ }`
			`+ unsigned char unescaped_utf[4];`
			`if (tok->ucs_char < 0x80) {`
			`- utf_out[0] = tok->ucs_char;`
			`- printbuf_memappend_fast(tok->pb, (char*)utf_out, 1);`
			`+ unescaped_utf[0] = tok->ucs_char;`
			`+ printbuf_memappend_fast(tok->pb, (char*)unescaped_utf, 1);`
			`} else if (tok->ucs_char < 0x800) {`
			`- utf_out[0] = 0xc0 \| (tok->ucs_char >> 6);`
			`- utf_out[1] = 0x80 \| (tok->ucs_char & 0x3f);`
			`- printbuf_memappend_fast(tok->pb, (char*)utf_out, 2);`
			`+ unescaped_utf[0] = 0xc0 \| (tok->ucs_char >> 6);`
			`+ unescaped_utf[1] = 0x80 \| (tok->ucs_char & 0x3f);`
			`+ printbuf_memappend_fast(tok->pb, (char*)unescaped_utf, 2);`
			`+ } else if (IS_HIGH_SURROGATE(tok->ucs_char)) {`
			`+ /* Got a high surrogate. Remember it and look for the`
			`+ * the beginning of another sequence, which should be the`
			`+ * low surrogate.`
			`+ */`
			`+ got_hi_surrogate = tok->ucs_char;`
			`+ /* Not at end, and the next two chars should be "\u" */`
			`+ if ((tok->char_offset+1 != len) &&`
			`+ (tok->char_offset+2 != len) &&`
			`+ (str[1] == '\\') &&`
			`+ (str[2] == 'u'))`
			`+ {`
			`+ ADVANCE_CHAR(str, tok);`
			`+ ADVANCE_CHAR(str, tok);`
			`+`
			`+ /* Advance to the first char of the next sequence and`
			`+ * continue processing with the next sequence.`
			`+ */`
			`+ if (!ADVANCE_CHAR(str, tok) \|\| !POP_CHAR(c, tok)) {`
			`+ printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);`
			`+ goto out;`
			`+ }`
			`+ tok->ucs_char = 0;`
			`+ tok->st_pos = 0;`
			`+ continue; /* other json_tokener_state_escape_unicode */`
			`+ } else {`
			`+ /* Got a high surrogate without another sequence following`
			`+ * it. Put a replacement char in for the hi surrogate`
			`+ * and pretend we finished.`
			`+ */`
			`+ printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);`
			`+ }`
			`+ } else if (IS_LOW_SURROGATE(tok->ucs_char)) {`
			`+ /* Got a low surrogate not preceded by a high */`
			`+ printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);`
			`+ } else if (tok->ucs_char < 0x10000) {`
			`+ unescaped_utf[0] = 0xe0 \| (tok->ucs_char >> 12);`
			`+ unescaped_utf[1] = 0x80 \| ((tok->ucs_char >> 6) & 0x3f);`
			`+ unescaped_utf[2] = 0x80 \| (tok->ucs_char & 0x3f);`
			`+ printbuf_memappend_fast(tok->pb, (char*)unescaped_utf, 3);`
			`+ } else if (tok->ucs_char < 0x110000) {`
			`+ unescaped_utf[0] = 0xf0 \| ((tok->ucs_char >> 18) & 0x07);`
			`+ unescaped_utf[1] = 0x80 \| ((tok->ucs_char >> 12) & 0x3f);`
			`+ unescaped_utf[2] = 0x80 \| ((tok->ucs_char >> 6) & 0x3f);`
			`+ unescaped_utf[3] = 0x80 \| (tok->ucs_char & 0x3f);`
			`+ printbuf_memappend_fast(tok->pb, (char*)unescaped_utf, 4);`
			`} else {`
			`- utf_out[0] = 0xe0 \| (tok->ucs_char >> 12);`
			`- utf_out[1] = 0x80 \| ((tok->ucs_char >> 6) & 0x3f);`
			`- utf_out[2] = 0x80 \| (tok->ucs_char & 0x3f);`
			`- printbuf_memappend_fast(tok->pb, (char*)utf_out, 3);`
			`- }`
			`+ /* Don't know what we got--insert the replacement char */`
			`+ printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);`
			`+ }`
			`state = saved_state;`
			`break;`
			`}`
			`} else {`
			`tok->err = json_tokener_error_parse_string;`
			`goto out;`
			`- }`
			`- if (!ADVANCE_CHAR(str, tok) \|\| !POP_CHAR(c, tok))`
			`+ }`
			`+ if (!ADVANCE_CHAR(str, tok) \|\| !POP_CHAR(c, tok)) {`
			`+ if (got_hi_surrogate) /* Clean up any pending chars */`
			`+ printbuf_memappend_fast(tok->pb, (char*)utf8_replacement_char, 3);`
			`goto out;`
			`+ }`
			`}`
			`}`
			`break;`