From e3e936f7ba244c77fe2f66c69b6fd35e9a1cfcf6 Mon Sep 17 00:00:00 2001
From: Philip Withnall <pwithnall@gnome.org>
Date: Tue, 15 Oct 2024 11:45:59 +0100
Subject: [PATCH 1/2] gdatainputstream: Use memchr() for the multi-stop-char
 case too
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is a follow up to commit e7e5ddd2a. oss-fuzz found a case where
performance was pathologically bad with a long `stop_chars` string.
Since our inner loop in that case was iterating over `stop_chars` and
comparing each of them to `buffer[i]`, we can use `memchr()` the
opposite way round to in commit e7e5ddd2a to speed that up, using
`buffer[i]` as the needle in a `stop_chars` haystack.

From some brief testing, this doesn’t impact on the performance of a
more normal use case of having a short (<10 bytes long) `stop_chars`. I
was slightly concerned that the function call overhead of calling out to
`memchr()` would have an impact there, but apparently not.

Signed-off-by: Philip Withnall <pwithnall@gnome.org>

oss-fuzz#372994443
---
 gio/gdatainputstream.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/gio/gdatainputstream.c b/gio/gdatainputstream.c
index ef728f006..bf2e71cd4 100644
--- a/gio/gdatainputstream.c
+++ b/gio/gdatainputstream.c
@@ -861,11 +861,8 @@ scan_for_chars (GDataInputStream *stream,
   gsize start, end, peeked;
   gsize i;
   gsize available, checked;
-  const char *stop_char;
-  const char *stop_end;
 
   bstream = G_BUFFERED_INPUT_STREAM (stream);
-  stop_end = stop_chars + stop_chars_len;
 
   checked = *checked_out;
 
@@ -874,8 +871,8 @@ scan_for_chars (GDataInputStream *stream,
   end = available;
   peeked = end - start;
 
-  /* For single-char case such as \0, defer to memchr which can
-   * take advantage of simd/etc.
+  /* For single-char case such as \0, defer the entire operation to memchr which
+   * can take advantage of simd/etc.
    */
   if (stop_chars_len == 1)
     {
@@ -888,11 +885,13 @@ scan_for_chars (GDataInputStream *stream,
     {
       for (i = 0; checked < available && i < peeked; i++)
         {
-          for (stop_char = stop_chars; stop_char != stop_end; stop_char++)
-            {
-              if (buffer[i] == *stop_char)
-                return (start + i);
-            }
+          /* We can use memchr() the other way round. Less fast than the
+           * single-char case above, but still faster than doing our own inner
+           * loop. */
+          const char *p = memchr (stop_chars, buffer[i], stop_chars_len);
+
+          if (p != NULL)
+            return (start + i);
         }
     }
 

From 50ccb04c71c95966a40849b21be98b587ef60248 Mon Sep 17 00:00:00 2001
From: Philip Withnall <pwithnall@gnome.org>
Date: Tue, 15 Oct 2024 11:49:33 +0100
Subject: [PATCH 2/2] tests: Fix 1-byte overread in data-input-stream tests

Commit 760a6f647 rearranged how the lengths are calculated for the test
data and added `escape_data_string()` so they could be printed safely.

Unfortunately there was a miscount in the length of the first test
vector in `test_read_upto()`: there are 31 bytes in the string literal,
plus one nul terminator which is added by the compiler. The quoted
string length was 32 bytes. This should be fine (explicitly including
the nul delimiter), but then `escape_data_string()` adds another byte to
the length because it assumes the nul delimiter has *not* been included
in the count.

Changing the string length from 32 to 31 breaks the tests, as the final
component of the data is then the wrong length, so add an additional
explicit nul byte to the string literal so that it matches the length.

Signed-off-by: Philip Withnall <pwithnall@gnome.org>
---
 gio/tests/data-input-stream.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gio/tests/data-input-stream.c b/gio/tests/data-input-stream.c
index 11c997bce..163231e1c 100644
--- a/gio/tests/data-input-stream.c
+++ b/gio/tests/data-input-stream.c
@@ -314,7 +314,7 @@ test_read_upto (void)
     const char *data_sep;
     size_t data_sep_len;
   } vectors[] = {
-    { 10, " part1 # part2 $ part3 \0 part4 ", 32, 7, "#$\0^", 4 },
+    { 10, " part1 # part2 $ part3 \0 part4 \0", 32, 7, "#$\0^", 4 },
     { 20, "{\"key\": \"value\"}\0", 17, 16, "\0", 1 },
   };