regex: Import PCRE 8.31

https://bugzilla.gnome.org/show_bug.cgi?id=679193
2025-08-22 08:58:54 +02:00 · 2012-06-14 22:15:27 +02:00
parent f66052fc87
commit 9457833010
12 changed files with 2090 additions and 1127 deletions
--- a/glib/pcre/pcre_dfa_exec.c
+++ b/glib/pcre/pcre_dfa_exec.c
@@ -38,10 +38,9 @@ POSSIBILITY OF SUCH DAMAGE.
 -----------------------------------------------------------------------------
 */

-
 /* This module contains the external function pcre_dfa_exec(), which is an
 alternative matching function that uses a sort of DFA algorithm (not a true
-FSM). This is NOT Perl- compatible, but it has advantages in certain
+FSM). This is NOT Perl-compatible, but it has advantages in certain
 applications. */


@@ -282,7 +281,7 @@ typedef struct stateblock {
  int data;                       /* Some use extra data */
 } stateblock;

-#define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
+#define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))


 #ifdef PCRE_DEBUG
@@ -382,7 +381,8 @@ for the current character, one for the following character). */
    next_new_state->count  = (y); \
    next_new_state->data   = (z); \
    next_new_state++; \
-    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
+    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
+      (x), (y), (z), __LINE__)); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE

@@ -424,6 +424,8 @@ BOOL utf = (md->poptions & PCRE_UTF8) != 0;
 BOOL utf = FALSE;
 #endif

+BOOL reset_could_continue = FALSE;
+
 rlevel++;
 offsetcount &= (-2);

@@ -571,7 +573,9 @@ for (;;)
  int clen, dlen;
  unsigned int c, d;
  int forced_fail = 0;
-  BOOL could_continue = FALSE;
+  BOOL partial_newline = FALSE;
+  BOOL could_continue = reset_could_continue;
+  reset_could_continue = FALSE;

  /* Make the new state list into the active state list and empty the
  new state list. */
@@ -607,7 +611,7 @@ for (;;)

  if (ptr < end_subject)
    {
-    clen = 1;        /* Number of bytes in the character */
+    clen = 1;        /* Number of data items in the character */
 #ifdef SUPPORT_UTF
    if (utf) { GETCHARLEN(c, ptr, clen); } else
 #endif  /* SUPPORT_UTF */
@@ -641,7 +645,8 @@ for (;;)

    /* A negative offset is a special case meaning "hold off going to this
    (negated) state until the number of characters in the data field have
-    been skipped". */
+    been skipped". If the could_continue flag was passed over from a previous
+    state, arrange for it to passed on. */

    if (state_offset < 0)
      {
@@ -650,6 +655,7 @@ for (;;)
        DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
        ADD_NEW_DATA(state_offset, current_state->count,
          current_state->data - 1);
+        if (could_continue) reset_could_continue = TRUE;
        continue;
        }
      else
@@ -689,10 +695,10 @@ for (;;)
    permitted.

    We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
-    argument that is not a data character - but is always one byte long. We
-    have to take special action to deal with  \P, \p, \H, \h, \V, \v and \X in
-    this case. To keep the other cases fast, convert these ones to new opcodes.
-    */
+    argument that is not a data character - but is always one byte long because
+    the values are small. We have to take special action to deal with  \P, \p,
+    \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
+    these ones to new opcodes. */

    if (coptable[codevalue] > 0)
      {
@@ -783,7 +789,7 @@ for (;;)
            offsets[0] = (int)(current_subject - start_subject);
            offsets[1] = (int)(ptr - start_subject);
            DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
-              offsets[1] - offsets[0], current_subject));
+              offsets[1] - offsets[0], (char *)current_subject));
            }
          if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
            {
@@ -888,7 +894,20 @@ for (;;)
      /*-----------------------------------------------------------------*/
      case OP_ANY:
      if (clen > 0 && !IS_NEWLINE(ptr))
-        { ADD_NEW(state_offset + 1, 0); }
+        {
+        if (ptr + 1 >= md->end_subject &&
+            (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
+            NLBLOCK->nltype == NLTYPE_FIXED &&
+            NLBLOCK->nllen == 2 &&
+            c == NLBLOCK->nl[0])
+          {
+          could_continue = partial_newline = TRUE;
+          }
+        else
+          {
+          ADD_NEW(state_offset + 1, 0);
+          }
+        }
      break;

      /*-----------------------------------------------------------------*/
@@ -916,6 +935,19 @@ for (;;)
               (ptr == end_subject - md->nllen)
            ))
          { ADD_ACTIVE(state_offset + 1, 0); }
+        else if (ptr + 1 >= md->end_subject &&
+                 (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
+                 NLBLOCK->nltype == NLTYPE_FIXED &&
+                 NLBLOCK->nllen == 2 &&
+                 c == NLBLOCK->nl[0])
+          {
+          if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
+            {
+            reset_could_continue = TRUE;
+            ADD_NEW_DATA(-(state_offset + 1), 0, 1);
+            }
+          else could_continue = partial_newline = TRUE;
+          }
        }
      break;

@@ -928,6 +960,19 @@ for (;;)
        else if (clen == 0 ||
            ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
          { ADD_ACTIVE(state_offset + 1, 0); }
+        else if (ptr + 1 >= md->end_subject &&
+                 (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
+                 NLBLOCK->nltype == NLTYPE_FIXED &&
+                 NLBLOCK->nllen == 2 &&
+                 c == NLBLOCK->nl[0])
+          {
+          if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
+            {
+            reset_could_continue = TRUE;
+            ADD_NEW_DATA(-(state_offset + 1), 0, 1);
+            }
+          else could_continue = partial_newline = TRUE;
+          }
        }
      else if (IS_NEWLINE(ptr))
        { ADD_ACTIVE(state_offset + 1, 0); }
@@ -1090,7 +1135,15 @@ for (;;)
      if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
      if (clen > 0)
        {
-        if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
+        if (d == OP_ANY && ptr + 1 >= md->end_subject &&
+            (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
+            NLBLOCK->nltype == NLTYPE_FIXED &&
+            NLBLOCK->nllen == 2 &&
+            c == NLBLOCK->nl[0])
+          {
+          could_continue = partial_newline = TRUE;
+          }
+        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
            (c < 256 &&
              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
@@ -1113,7 +1166,15 @@ for (;;)
      ADD_ACTIVE(state_offset + 2, 0);
      if (clen > 0)
        {
-        if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
+        if (d == OP_ANY && ptr + 1 >= md->end_subject &&
+            (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
+            NLBLOCK->nltype == NLTYPE_FIXED &&
+            NLBLOCK->nllen == 2 &&
+            c == NLBLOCK->nl[0])
+          {
+          could_continue = partial_newline = TRUE;
+          }
+        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
            (c < 256 &&
              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
@@ -1135,7 +1196,15 @@ for (;;)
      ADD_ACTIVE(state_offset + 2, 0);
      if (clen > 0)
        {
-        if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
+        if (d == OP_ANY && ptr + 1 >= md->end_subject &&
+            (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
+            NLBLOCK->nltype == NLTYPE_FIXED &&
+            NLBLOCK->nllen == 2 &&
+            c == NLBLOCK->nl[0])
+          {
+          could_continue = partial_newline = TRUE;
+          }
+        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
            (c < 256 &&
              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
@@ -1155,7 +1224,15 @@ for (;;)
      count = current_state->count;  /* Number already matched */
      if (clen > 0)
        {
-        if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
+        if (d == OP_ANY && ptr + 1 >= md->end_subject &&
+            (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
+            NLBLOCK->nltype == NLTYPE_FIXED &&
+            NLBLOCK->nllen == 2 &&
+            c == NLBLOCK->nl[0])
+          {
+          could_continue = partial_newline = TRUE;
+          }
+        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
            (c < 256 &&
              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
@@ -1176,7 +1253,15 @@ for (;;)
      count = current_state->count;  /* Number already matched */
      if (clen > 0)
        {
-        if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
+        if (d == OP_ANY && ptr + 1 >= md->end_subject &&
+            (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
+            NLBLOCK->nltype == NLTYPE_FIXED &&
+            NLBLOCK->nllen == 2 &&
+            c == NLBLOCK->nl[0])
+          {
+          could_continue = partial_newline = TRUE;
+          }
+        else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
            (c < 256 &&
              (d != OP_ANY || !IS_NEWLINE(ptr)) &&
              ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
@@ -1824,6 +1909,8 @@ for (;;)
          ncount++;
          nptr += ndlen;
          }
+        if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
+            reset_could_continue = TRUE;
        if (++count >= GET2(code, 1))
          { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
        else
@@ -2037,6 +2124,8 @@ for (;;)
          ncount++;
          nptr += nclen;
          }
+        if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
+            reset_could_continue = TRUE;
        ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
        }
      break;
@@ -2062,7 +2151,13 @@ for (;;)
        break;

        case 0x000d:
-        if (ptr + 1 < end_subject && ptr[1] == 0x0a)
+        if (ptr + 1 >= end_subject)
+          {
+          ADD_NEW(state_offset + 1, 0);
+          if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
+            reset_could_continue = TRUE;
+          }
+        else if (ptr[1] == 0x0a)
          {
          ADD_NEW_DATA(-(state_offset + 1), 0, 1);
          }
@@ -2171,22 +2266,32 @@ for (;;)
      break;

      /*-----------------------------------------------------------------*/
-      /* Match a negated single character casefully. This is only used for
-      one-byte characters, that is, we know that d < 256. The character we are
-      checking (c) can be multibyte. */
+      /* Match a negated single character casefully. */

      case OP_NOT:
      if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
      break;

      /*-----------------------------------------------------------------*/
-      /* Match a negated single character caselessly. This is only used for
-      one-byte characters, that is, we know that d < 256. The character we are
-      checking (c) can be multibyte. */
+      /* Match a negated single character caselessly. */

      case OP_NOTI:
-      if (clen > 0 && c != d && c != fcc[d])
-        { ADD_NEW(state_offset + dlen + 1, 0); }
+      if (clen > 0)
+        {
+        unsigned int otherd;
+#ifdef SUPPORT_UTF
+        if (utf && d >= 128)
+          {
+#ifdef SUPPORT_UCP
+          otherd = UCD_OTHERCASE(d);
+#endif  /* SUPPORT_UCP */
+          }
+        else
+#endif  /* SUPPORT_UTF */
+        otherd = TABLE_GET(d, fcc, d);
+        if (c != d && c != otherd)
+          { ADD_NEW(state_offset + dlen + 1, 0); }
+        }
      break;

      /*-----------------------------------------------------------------*/
@@ -2692,9 +2797,12 @@ for (;;)
            {
            int charcount = local_offsets[rc+1] - local_offsets[rc];
 #ifdef SUPPORT_UTF
-            const pcre_uchar *p = start_subject + local_offsets[rc];
-            const pcre_uchar *pp = start_subject + local_offsets[rc+1];
-            while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+            if (utf)
+              {
+              const pcre_uchar *p = start_subject + local_offsets[rc];
+              const pcre_uchar *pp = start_subject + local_offsets[rc+1];
+              while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+              }
 #endif
            if (charcount > 0)
              {
@@ -2793,7 +2901,7 @@ for (;;)
            const pcre_uchar *pp = local_ptr;
            charcount = (int)(pp - p);
 #ifdef SUPPORT_UTF
-            while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+            if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
 #endif
            ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
            }
@@ -2875,9 +2983,12 @@ for (;;)
          else
            {
 #ifdef SUPPORT_UTF
-            const pcre_uchar *p = start_subject + local_offsets[0];
-            const pcre_uchar *pp = start_subject + local_offsets[1];
-            while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+            if (utf)
+              {
+              const pcre_uchar *p = start_subject + local_offsets[0];
+              const pcre_uchar *pp = start_subject + local_offsets[1];
+              while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
+              }
 #endif
            ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
            if (repeat_state_offset >= 0)
@@ -2946,7 +3057,7 @@ for (;;)
  if (new_count <= 0)
    {
    if (rlevel == 1 &&                               /* Top level, and */
-        could_continue &&                            /* Some could go on */
+        could_continue &&                            /* Some could go on, and */
        forced_fail != workspace[1] &&               /* Not all forced fail & */
        (                                            /* either... */
        (md->moptions & PCRE_PARTIAL_HARD) != 0      /* Hard partial */
@@ -2954,8 +3065,13 @@ for (;;)
        ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
         match_count < 0)                            /* no matches */
        ) &&                                         /* And... */
-        ptr >= end_subject &&                  /* Reached end of subject */
-        ptr > md->start_used_ptr)              /* Inspected non-empty string */
+        (
+        partial_newline ||                           /* Either partial NL */
+          (                                          /* or ... */
+          ptr >= end_subject &&                /* End of subject and */
+          ptr > md->start_used_ptr)            /* Inspected non-empty string */
+          )
+        )
      {
      if (offsetcount >= 2)
        {
@@ -3052,10 +3168,27 @@ if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;

-/* We need to find the pointer to any study data before we test for byte
-flipping, so we scan the extra_data block first. This may set two fields in the
-match block, so we must initialize them beforehand. However, the other fields
-in the match block must not be set until after the byte flipping. */
+/* Check that the first field in the block is the magic number. If it is not,
+return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
+REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
+means that the pattern is likely compiled with different endianness. */
+
+if (re->magic_number != MAGIC_NUMBER)
+  return re->magic_number == REVERSED_MAGIC_NUMBER?
+    PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
+if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
+
+/* If restarting after a partial match, do some sanity checks on the contents
+of the workspace. */
+
+if ((options & PCRE_DFA_RESTART) != 0)
+  {
+  if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
+    workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
+      return PCRE_ERROR_DFA_BADRESTART;
+  }
+
+/* Set up study, callout, and table data */

 md->tables = re->tables;
 md->callout_data = NULL;
@@ -3074,16 +3207,6 @@ if (extra_data != NULL)
    md->tables = extra_data->tables;
  }

-/* Check that the first field in the block is the magic number. If it is not,
-return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
-REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
-means that the pattern is likely compiled with different endianness. */
-
-if (re->magic_number != MAGIC_NUMBER)
-  return re->magic_number == REVERSED_MAGIC_NUMBER?
-    PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
-if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
-
 /* Set some local values */

 current_subject = (const pcre_uchar *)subject + start_offset;