Forgotten files

2025-08-01 23:13:40 +02:00 · 2011-01-22 00:01:54 -05:00
parent 3f059a6a12
commit fb2809ec99
10 changed files with 2273 additions and 1001 deletions
--- a/glib/pcre/pcre.h
+++ b/glib/pcre/pcre.h
@@ -5,7 +5,7 @@
 /* This is the public header file for the PCRE library, to be #included by
 applications that call the PCRE functions.

-           Copyright (c) 1997-2009 University of Cambridge
+           Copyright (c) 1997-2010 University of Cambridge

 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE.
 /* The current PCRE version information. */

 #define PCRE_MAJOR          8
-#define PCRE_MINOR          02
+#define PCRE_MINOR          12
 #define PCRE_PRERELEASE     
-#define PCRE_DATE           2010-03-19
+#define PCRE_DATE           2011-01-15

 /* When an application links to a PCRE DLL in Windows, the symbols that are
 imported have to be identified as such. When building PCRE, the appropriate
@@ -96,41 +96,44 @@ extern "C" {
 #endif

 /* Options. Some are compile-time only, some are run-time only, and some are
-both, so we keep them all distinct. */
+both, so we keep them all distinct. However, almost all the bits in the options
+word are now used. In the long run, we may have to re-use some of the
+compile-time only bits for runtime options, or vice versa. */

-#define PCRE_CASELESS           0x00000001
-#define PCRE_MULTILINE          0x00000002
-#define PCRE_DOTALL             0x00000004
-#define PCRE_EXTENDED           0x00000008
-#define PCRE_ANCHORED           0x00000010
-#define PCRE_DOLLAR_ENDONLY     0x00000020
-#define PCRE_EXTRA              0x00000040
-#define PCRE_NOTBOL             0x00000080
-#define PCRE_NOTEOL             0x00000100
-#define PCRE_UNGREEDY           0x00000200
-#define PCRE_NOTEMPTY           0x00000400
-#define PCRE_UTF8               0x00000800
-#define PCRE_NO_AUTO_CAPTURE    0x00001000
-#define PCRE_NO_UTF8_CHECK      0x00002000
-#define PCRE_AUTO_CALLOUT       0x00004000
-#define PCRE_PARTIAL_SOFT       0x00008000
+#define PCRE_CASELESS           0x00000001  /* Compile */
+#define PCRE_MULTILINE          0x00000002  /* Compile */
+#define PCRE_DOTALL             0x00000004  /* Compile */
+#define PCRE_EXTENDED           0x00000008  /* Compile */
+#define PCRE_ANCHORED           0x00000010  /* Compile, exec, DFA exec */
+#define PCRE_DOLLAR_ENDONLY     0x00000020  /* Compile */
+#define PCRE_EXTRA              0x00000040  /* Compile */
+#define PCRE_NOTBOL             0x00000080  /* Exec, DFA exec */
+#define PCRE_NOTEOL             0x00000100  /* Exec, DFA exec */
+#define PCRE_UNGREEDY           0x00000200  /* Compile */
+#define PCRE_NOTEMPTY           0x00000400  /* Exec, DFA exec */
+#define PCRE_UTF8               0x00000800  /* Compile */
+#define PCRE_NO_AUTO_CAPTURE    0x00001000  /* Compile */
+#define PCRE_NO_UTF8_CHECK      0x00002000  /* Compile, exec, DFA exec */
+#define PCRE_AUTO_CALLOUT       0x00004000  /* Compile */
+#define PCRE_PARTIAL_SOFT       0x00008000  /* Exec, DFA exec */
 #define PCRE_PARTIAL            0x00008000  /* Backwards compatible synonym */
-#define PCRE_DFA_SHORTEST       0x00010000
-#define PCRE_DFA_RESTART        0x00020000
-#define PCRE_FIRSTLINE          0x00040000
-#define PCRE_DUPNAMES           0x00080000
-#define PCRE_NEWLINE_CR         0x00100000
-#define PCRE_NEWLINE_LF         0x00200000
-#define PCRE_NEWLINE_CRLF       0x00300000
-#define PCRE_NEWLINE_ANY        0x00400000
-#define PCRE_NEWLINE_ANYCRLF    0x00500000
-#define PCRE_BSR_ANYCRLF        0x00800000
-#define PCRE_BSR_UNICODE        0x01000000
-#define PCRE_JAVASCRIPT_COMPAT  0x02000000
-#define PCRE_NO_START_OPTIMIZE  0x04000000
-#define PCRE_NO_START_OPTIMISE  0x04000000
-#define PCRE_PARTIAL_HARD       0x08000000
-#define PCRE_NOTEMPTY_ATSTART   0x10000000
+#define PCRE_DFA_SHORTEST       0x00010000  /* DFA exec */
+#define PCRE_DFA_RESTART        0x00020000  /* DFA exec */
+#define PCRE_FIRSTLINE          0x00040000  /* Compile */
+#define PCRE_DUPNAMES           0x00080000  /* Compile */
+#define PCRE_NEWLINE_CR         0x00100000  /* Compile, exec, DFA exec */
+#define PCRE_NEWLINE_LF         0x00200000  /* Compile, exec, DFA exec */
+#define PCRE_NEWLINE_CRLF       0x00300000  /* Compile, exec, DFA exec */
+#define PCRE_NEWLINE_ANY        0x00400000  /* Compile, exec, DFA exec */
+#define PCRE_NEWLINE_ANYCRLF    0x00500000  /* Compile, exec, DFA exec */
+#define PCRE_BSR_ANYCRLF        0x00800000  /* Compile, exec, DFA exec */
+#define PCRE_BSR_UNICODE        0x01000000  /* Compile, exec, DFA exec */
+#define PCRE_JAVASCRIPT_COMPAT  0x02000000  /* Compile */
+#define PCRE_NO_START_OPTIMIZE  0x04000000  /* Compile, exec, DFA exec */
+#define PCRE_NO_START_OPTIMISE  0x04000000  /* Synonym */
+#define PCRE_PARTIAL_HARD       0x08000000  /* Exec, DFA exec */
+#define PCRE_NOTEMPTY_ATSTART   0x10000000  /* Exec, DFA exec */
+#define PCRE_UCP                0x20000000  /* Compile */

 /* Exec-time and get/set-time error codes */

@@ -158,6 +161,8 @@ both, so we keep them all distinct. */
 #define PCRE_ERROR_RECURSIONLIMIT (-21)
 #define PCRE_ERROR_NULLWSLIMIT    (-22)  /* No longer actually used */
 #define PCRE_ERROR_BADNEWLINE     (-23)
+#define PCRE_ERROR_BADOFFSET      (-24)
+#define PCRE_ERROR_SHORTUTF8      (-25)

 /* Request types for pcre_fullinfo() */

@@ -200,6 +205,7 @@ these bits, just add new ones on the end, in order to remain compatible. */
 #define PCRE_EXTRA_CALLOUT_DATA           0x0004
 #define PCRE_EXTRA_TABLES                 0x0008
 #define PCRE_EXTRA_MATCH_LIMIT_RECURSION  0x0010
+#define PCRE_EXTRA_MARK                   0x0020

 /* Types */

@@ -225,6 +231,7 @@ typedef struct pcre_extra {
  void *callout_data;             /* Data passed back in callouts */
  const unsigned char *tables;    /* Pointer to character tables */
  unsigned long int match_limit_recursion; /* Max recursive calls to match() */
+  unsigned char **mark;           /* For passing back a mark pointer */
 } pcre_extra;

 /* The structure for passing out data via the pcre_callout_function. We use a
--- a/glib/pcre/pcre_chartables.c
+++ b/glib/pcre/pcre_chartables.c
@@ -14,7 +14,7 @@ example ISO-8859-1. When dftables is run, it creates these tables in the
 current locale. If PCRE is configured with --enable-rebuild-chartables, this
 happens automatically.

-The following #includes are present because without the gcc 4.x may remove the
+The following #includes are present because without them gcc 4.x may remove the
 array definition from the final binary if PCRE is built into a static library
 and dead code stripping is activated. This leads to link errors. Pulling in the
 header ensures that the array gets flagged as "someone outside this compilation
--- a/glib/pcre/pcre_compile.c
+++ b/glib/pcre/pcre_compile.c
--- a/glib/pcre/pcre_dfa_exec.c
+++ b/glib/pcre/pcre_dfa_exec.c
@@ -106,7 +106,7 @@ never stored, so we push them well clear of the normal opcodes. */


 /* This table identifies those opcodes that are followed immediately by a
-character that is to be tested in some way. This makes is possible to
+character that is to be tested in some way. This makes it possible to
 centralize the loading of these characters. In the case of Type * etc, the
 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
 small value. Non-zero values in the table are the offsets from the opcode where
@@ -161,8 +161,9 @@ static const uschar coptable[] = {
  0, 0,                          /* RREF, NRREF                            */
  0,                             /* DEF                                    */
  0, 0,                          /* BRAZERO, BRAMINZERO                    */
-  0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
-  0, 0, 0, 0                     /* FAIL, ACCEPT, CLOSE, SKIPZERO          */
+  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG,                */
+  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG,        */
+  0, 0, 0, 0, 0                  /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO  */
 };

 /* This table identifies those opcodes that inspect a character. It is used to
@@ -218,8 +219,9 @@ static const uschar poptable[] = {
  0, 0,                          /* RREF, NRREF                            */
  0,                             /* DEF                                    */
  0, 0,                          /* BRAZERO, BRAMINZERO                    */
-  0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
-  0, 0, 0, 0                     /* FAIL, ACCEPT, CLOSE, SKIPZERO          */
+  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG,                */
+  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG,        */
+  0, 0, 0, 0, 0                  /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO  */
 };

 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
@@ -473,7 +475,7 @@ if (*first_op == OP_REVERSE)

    {
    gone_back = (current_subject - max_back < start_subject)?
-      current_subject - start_subject : max_back;
+      (int)(current_subject - start_subject) : max_back;
    current_subject -= gone_back;
    }

@@ -490,7 +492,7 @@ if (*first_op == OP_REVERSE)
    int back = GET(end_code, 2+LINK_SIZE);
    if (back <= gone_back)
      {
-      int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
+      int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
      ADD_NEW_DATA(-bstate, 0, gone_back - back);
      }
    end_code += GET(end_code, 1);
@@ -526,7 +528,7 @@ else
      ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
    do
      {
-      ADD_NEW(end_code - start_code + length, 0);
+      ADD_NEW((int)(end_code - start_code + length), 0);
      end_code += GET(end_code, 1);
      length = 1 + LINK_SIZE;
      }
@@ -753,8 +755,8 @@ for (;;)
          if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
          if (offsetcount >= 2)
            {
-            offsets[0] = current_subject - start_subject;
-            offsets[1] = ptr - start_subject;
+            offsets[0] = (int)(current_subject - start_subject);
+            offsets[1] = (int)(ptr - start_subject);
            DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
              offsets[1] - offsets[0], current_subject));
            }
@@ -776,7 +778,7 @@ for (;;)
      /*-----------------------------------------------------------------*/
      case OP_ALT:
      do { code += GET(code, 1); } while (*code == OP_ALT);
-      ADD_ACTIVE(code - start_code, 0);
+      ADD_ACTIVE((int)(code - start_code), 0);
      break;

      /*-----------------------------------------------------------------*/
@@ -784,7 +786,7 @@ for (;;)
      case OP_SBRA:
      do
        {
-        ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
+        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
        code += GET(code, 1);
        }
      while (*code == OP_ALT);
@@ -793,11 +795,11 @@ for (;;)
      /*-----------------------------------------------------------------*/
      case OP_CBRA:
      case OP_SCBRA:
-      ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);
+      ADD_ACTIVE((int)(code - start_code + 3 + LINK_SIZE),  0);
      code += GET(code, 1);
      while (*code == OP_ALT)
        {
-        ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);
+        ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE),  0);
        code += GET(code, 1);
        }
      break;
@@ -808,14 +810,14 @@ for (;;)
      ADD_ACTIVE(state_offset + 1, 0);
      code += 1 + GET(code, 2);
      while (*code == OP_ALT) code += GET(code, 1);
-      ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
+      ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
      break;

      /*-----------------------------------------------------------------*/
      case OP_SKIPZERO:
      code += 1 + GET(code, 2);
      while (*code == OP_ALT) code += GET(code, 1);
-      ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
+      ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
      break;

      /*-----------------------------------------------------------------*/
@@ -829,7 +831,12 @@ for (;;)

      /*-----------------------------------------------------------------*/
      case OP_EOD:
-      if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
+      if (ptr >= end_subject)
+        {
+        if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
+          could_continue = TRUE;
+        else { ADD_ACTIVE(state_offset + 1, 0); }
+        }
      break;

      /*-----------------------------------------------------------------*/
@@ -869,7 +876,9 @@ for (;;)

      /*-----------------------------------------------------------------*/
      case OP_EODN:
-      if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
+      if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
+        could_continue = TRUE;
+      else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
        { ADD_ACTIVE(state_offset + 1, 0); }
      break;

@@ -877,7 +886,9 @@ for (;;)
      case OP_DOLL:
      if ((md->moptions & PCRE_NOTEOL) == 0)
        {
-        if (clen == 0 ||
+        if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
+          could_continue = TRUE;
+        else if (clen == 0 ||
            ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
               ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
            ))
@@ -920,13 +931,37 @@ for (;;)
          if (utf8) BACKCHAR(temp);
 #endif
          GETCHARTEST(d, temp);
+#ifdef SUPPORT_UCP
+          if ((md->poptions & PCRE_UCP) != 0)
+            {
+            if (d == '_') left_word = TRUE; else
+              {
+              int cat = UCD_CATEGORY(d);
+              left_word = (cat == ucp_L || cat == ucp_N);
+              }
+            }
+          else
+#endif
          left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
          }
-        else left_word = 0;
+        else left_word = FALSE;

        if (clen > 0)
+          {
+#ifdef SUPPORT_UCP
+          if ((md->poptions & PCRE_UCP) != 0)
+            {
+            if (c == '_') right_word = TRUE; else
+              {
+              int cat = UCD_CATEGORY(c);
+              right_word = (cat == ucp_L || cat == ucp_N);
+              }
+            }
+          else
+#endif
          right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
-        else right_word = 0;
+          }
+        else right_word = FALSE;

        if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
          { ADD_ACTIVE(state_offset + 1, 0); }
@@ -953,7 +988,8 @@ for (;;)
          break;

          case PT_LAMP:
-          OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
+          OK = chartype == ucp_Lu || chartype == ucp_Ll ||
+               chartype == ucp_Lt;
          break;

          case PT_GC:
@@ -968,6 +1004,30 @@ for (;;)
          OK = UCD_SCRIPT(c) == code[2];
          break;

+          /* These are specials for combination cases. */
+
+          case PT_ALNUM:
+          OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+               _pcre_ucp_gentype[chartype] == ucp_N;
+          break;
+
+          case PT_SPACE:    /* Perl space */
+          OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+               c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
+          break;
+
+          case PT_PXSPACE:  /* POSIX space */
+          OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+               c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+               c == CHAR_FF || c == CHAR_CR;
+          break;
+
+          case PT_WORD:
+          OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+               _pcre_ucp_gentype[chartype] == ucp_N ||
+               c == CHAR_UNDERSCORE;
+          break;
+
          /* Should never occur, but keep compilers from grumbling. */

          default:
@@ -1122,7 +1182,8 @@ for (;;)
          break;

          case PT_LAMP:
-          OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
+          OK = chartype == ucp_Lu || chartype == ucp_Ll ||
+            chartype == ucp_Lt;
          break;

          case PT_GC:
@@ -1137,6 +1198,30 @@ for (;;)
          OK = UCD_SCRIPT(c) == code[3];
          break;

+          /* These are specials for combination cases. */
+
+          case PT_ALNUM:
+          OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+               _pcre_ucp_gentype[chartype] == ucp_N;
+          break;
+
+          case PT_SPACE:    /* Perl space */
+          OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+               c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
+          break;
+
+          case PT_PXSPACE:  /* POSIX space */
+          OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+               c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+               c == CHAR_FF || c == CHAR_CR;
+          break;
+
+          case PT_WORD:
+          OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+               _pcre_ucp_gentype[chartype] == ucp_N ||
+               c == CHAR_UNDERSCORE;
+          break;
+
          /* Should never occur, but keep compilers from grumbling. */

          default:
@@ -1344,7 +1429,8 @@ for (;;)
          break;

          case PT_LAMP:
-          OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
+          OK = chartype == ucp_Lu || chartype == ucp_Ll ||
+            chartype == ucp_Lt;
          break;

          case PT_GC:
@@ -1359,6 +1445,30 @@ for (;;)
          OK = UCD_SCRIPT(c) == code[3];
          break;

+          /* These are specials for combination cases. */
+
+          case PT_ALNUM:
+          OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+               _pcre_ucp_gentype[chartype] == ucp_N;
+          break;
+
+          case PT_SPACE:    /* Perl space */
+          OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+               c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
+          break;
+
+          case PT_PXSPACE:  /* POSIX space */
+          OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+               c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+               c == CHAR_FF || c == CHAR_CR;
+          break;
+
+          case PT_WORD:
+          OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+               _pcre_ucp_gentype[chartype] == ucp_N ||
+               c == CHAR_UNDERSCORE;
+          break;
+
          /* Should never occur, but keep compilers from grumbling. */

          default:
@@ -1591,7 +1701,8 @@ for (;;)
          break;

          case PT_LAMP:
-          OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
+          OK = chartype == ucp_Lu || chartype == ucp_Ll ||
+            chartype == ucp_Lt;
          break;

          case PT_GC:
@@ -1606,6 +1717,30 @@ for (;;)
          OK = UCD_SCRIPT(c) == code[5];
          break;

+          /* These are specials for combination cases. */
+
+          case PT_ALNUM:
+          OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+               _pcre_ucp_gentype[chartype] == ucp_N;
+          break;
+
+          case PT_SPACE:    /* Perl space */
+          OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+               c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
+          break;
+
+          case PT_PXSPACE:  /* POSIX space */
+          OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
+               c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+               c == CHAR_FF || c == CHAR_CR;
+          break;
+
+          case PT_WORD:
+          OK = _pcre_ucp_gentype[chartype] == ucp_L ||
+               _pcre_ucp_gentype[chartype] == ucp_N ||
+               c == CHAR_UNDERSCORE;
+          break;
+
          /* Should never occur, but keep compilers from grumbling. */

          default:
@@ -2233,7 +2368,7 @@ for (;;)
        points to the byte after the end of the class. If there is a
        quantifier, this is where it will be. */

-        next_state_offset = ecode - start_code;
+        next_state_offset = (int)(ecode - start_code);

        switch (*ecode)
          {
@@ -2304,7 +2439,7 @@ for (;;)
          md,                                   /* static match data */
          code,                                 /* this subexpression's code */
          ptr,                                  /* where we currently are */
-          ptr - start_subject,                  /* start offset */
+          (int)(ptr - start_subject),           /* start offset */
          local_offsets,                        /* offset vector */
          sizeof(local_offsets)/sizeof(int),    /* size of same */
          local_workspace,                      /* workspace vector */
@@ -2315,7 +2450,7 @@ for (;;)

        if (rc == PCRE_ERROR_DFA_UITEM) return rc;
        if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
-            { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
+            { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
        }
      break;

@@ -2342,9 +2477,9 @@ for (;;)
            cb.callout_number   = code[LINK_SIZE+2];
            cb.offset_vector    = offsets;
            cb.subject          = (PCRE_SPTR)start_subject;
-            cb.subject_length   = end_subject - start_subject;
-            cb.start_match      = current_subject - start_subject;
-            cb.current_position = ptr - start_subject;
+            cb.subject_length   = (int)(end_subject - start_subject);
+            cb.start_match      = (int)(current_subject - start_subject);
+            cb.current_position = (int)(ptr - start_subject);
            cb.pattern_position = GET(code, LINK_SIZE + 3);
            cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
            cb.capture_top      = 1;
@@ -2395,7 +2530,7 @@ for (;;)
            md,                                   /* fixed match data */
            asscode,                              /* this subexpression's code */
            ptr,                                  /* where we currently are */
-            ptr - start_subject,                  /* start offset */
+            (int)(ptr - start_subject),           /* start offset */
            local_offsets,                        /* offset vector */
            sizeof(local_offsets)/sizeof(int),    /* size of same */
            local_workspace,                      /* workspace vector */
@@ -2407,7 +2542,7 @@ for (;;)
          if (rc == PCRE_ERROR_DFA_UITEM) return rc;
          if ((rc >= 0) ==
                (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
-            { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
+            { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
          else
            { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
          }
@@ -2428,7 +2563,7 @@ for (;;)
          md,                                   /* fixed match data */
          start_code + GET(code, 1),            /* this subexpression's code */
          ptr,                                  /* where we currently are */
-          ptr - start_subject,                  /* start offset */
+          (int)(ptr - start_subject),           /* start offset */
          local_offsets,                        /* offset vector */
          sizeof(local_offsets)/sizeof(int),    /* size of same */
          local_workspace,                      /* workspace vector */
@@ -2480,7 +2615,7 @@ for (;;)
          md,                                   /* fixed match data */
          code,                                 /* this subexpression's code */
          ptr,                                  /* where we currently are */
-          ptr - start_subject,                  /* start offset */
+          (int)(ptr - start_subject),           /* start offset */
          local_offsets,                        /* offset vector */
          sizeof(local_offsets)/sizeof(int),    /* size of same */
          local_workspace,                      /* workspace vector */
@@ -2497,7 +2632,8 @@ for (;;)

          do { end_subpattern += GET(end_subpattern, 1); }
            while (*end_subpattern == OP_ALT);
-          next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
+          next_state_offset =
+            (int)(end_subpattern - start_code + LINK_SIZE + 1);

          /* If the end of this subpattern is KETRMAX or KETRMIN, we must
          arrange for the repeat state also to be added to the relevant list.
@@ -2505,7 +2641,7 @@ for (;;)

          repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
                                 *end_subpattern == OP_KETRMIN)?
-            end_subpattern - start_code - GET(end_subpattern, 1) : -1;
+            (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;

          /* If we have matched an empty string, add the next state at the
          current character pointer. This is important so that the duplicate
@@ -2569,9 +2705,9 @@ for (;;)
        cb.callout_number   = code[1];
        cb.offset_vector    = offsets;
        cb.subject          = (PCRE_SPTR)start_subject;
-        cb.subject_length   = end_subject - start_subject;
-        cb.start_match      = current_subject - start_subject;
-        cb.current_position = ptr - start_subject;
+        cb.subject_length   = (int)(end_subject - start_subject);
+        cb.start_match      = (int)(current_subject - start_subject);
+        cb.current_position = (int)(ptr - start_subject);
        cb.pattern_position = GET(code, 2);
        cb.next_item_length = GET(code, 2 + LINK_SIZE);
        cb.capture_top      = 1;
@@ -2617,13 +2753,13 @@ for (;;)
        ((md->moptions & PCRE_PARTIAL_SOFT) != 0 &&  /* Soft partial and */
         match_count < 0)                            /* no matches */
        ) &&                                         /* And... */
-        ptr >= end_subject &&                     /* Reached end of subject */
-        ptr > current_subject)                    /* Matched non-empty string */
+        ptr >= end_subject &&                  /* Reached end of subject */
+        ptr > md->start_used_ptr)              /* Inspected non-empty string */
      {
      if (offsetcount >= 2)
        {
-        offsets[0] = md->start_used_ptr - start_subject;
-        offsets[1] = end_subject - start_subject;
+        offsets[0] = (int)(md->start_used_ptr - start_subject);
+        offsets[1] = (int)(end_subject - start_subject);
        }
      match_count = PCRE_ERROR_PARTIAL;
      }
@@ -2708,6 +2844,7 @@ if (re == NULL || subject == NULL || workspace == NULL ||
   (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
+if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;

 /* We need to find the pointer to any study data before we test for byte
 flipping, so we scan the extra_data block first. This may set two fields in the
@@ -2826,16 +2963,14 @@ back the character offset. */
 #ifdef SUPPORT_UTF8
 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
  {
-  if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
-    return PCRE_ERROR_BADUTF8;
+  int tb;
+  if ((tb = _pcre_valid_utf8((uschar *)subject, length)) >= 0)
+    return (tb == length && (options & PCRE_PARTIAL_HARD) != 0)?
+      PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
  if (start_offset > 0 && start_offset < length)
    {
-    int tb = ((uschar *)subject)[start_offset];
-    if (tb > 127)
-      {
-      tb &= 0xc0;
-      if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
-      }
+    tb = ((USPTR)subject)[start_offset] & 0xc0;
+    if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
    }
  }
 #endif
@@ -2922,9 +3057,11 @@ for (;;)

    /* There are some optimizations that avoid running the match if a known
    starting point is not found. However, there is an option that disables
-    these, for testing and for ensuring that all callouts do actually occur. */
+    these, for testing and for ensuring that all callouts do actually occur.
+    The option can be set in the regex by (*NO_START_OPT) or passed in
+    match-time options. */

-    if ((options & PCRE_NO_START_OPTIMIZE) == 0)
+    if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
      {
      /* Advance to a known first byte. */

@@ -2982,8 +3119,16 @@ for (;;)
        while (current_subject < end_subject)
          {
          register unsigned int c = *current_subject;
-          if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
-            else break;
+          if ((start_bits[c/8] & (1 << (c&7))) == 0)
+            {
+            current_subject++;
+#ifdef SUPPORT_UTF8
+            if (utf8)
+              while(current_subject < end_subject &&
+                    (*current_subject & 0xc0) == 0x80) current_subject++;
+#endif
+            }
+          else break;
          }
        }
      }
--- a/glib/pcre/pcre_exec.c
+++ b/glib/pcre/pcre_exec.c
--- a/glib/pcre/pcre_internal.h
+++ b/glib/pcre/pcre_internal.h
@@ -408,9 +408,10 @@ capturing parenthesis numbers in back references. */

 /* When UTF-8 encoding is being used, a character is no longer just a single
 byte. The macros for character handling generate simple sequences when used in
-byte-mode, and more complicated ones for UTF-8 characters. BACKCHAR should
-never be called in byte mode. To make sure it can never even appear when UTF-8
-support is omitted, we don't even define it. */
+byte-mode, and more complicated ones for UTF-8 characters. GETCHARLENTEST is
+not used when UTF-8 is not supported, so it is not defined, and BACKCHAR should
+never be called in byte mode. To make sure they can never even appear when
+UTF-8 support is omitted, we don't even define them. */

 #ifndef SUPPORT_UTF8
 #define GETCHAR(c, eptr) c = *eptr;
@@ -418,43 +419,83 @@ support is omitted, we don't even define it. */
 #define GETCHARINC(c, eptr) c = *eptr++;
 #define GETCHARINCTEST(c, eptr) c = *eptr++;
 #define GETCHARLEN(c, eptr, len) c = *eptr;
+/* #define GETCHARLENTEST(c, eptr, len) */
 /* #define BACKCHAR(eptr) */

 #else   /* SUPPORT_UTF8 */

+/* These macros were originally written in the form of loops that used data
+from the tables whose names start with _pcre_utf8_table. They were rewritten by
+a user so as not to use loops, because in some environments this gives a
+significant performance advantage, and it seems never to do any harm. */
+
+/* Base macro to pick up the remaining bytes of a UTF-8 character, not
+advancing the pointer. */
+
+#define GETUTF8(c, eptr) \
+    { \
+    if ((c & 0x20) == 0) \
+      c = ((c & 0x1f) << 6) | (eptr[1] & 0x3f); \
+    else if ((c & 0x10) == 0) \
+      c = ((c & 0x0f) << 12) | ((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \
+    else if ((c & 0x08) == 0) \
+      c = ((c & 0x07) << 18) | ((eptr[1] & 0x3f) << 12) | \
+      ((eptr[2] & 0x3f) << 6) | (eptr[3] & 0x3f); \
+    else if ((c & 0x04) == 0) \
+      c = ((c & 0x03) << 24) | ((eptr[1] & 0x3f) << 18) | \
+          ((eptr[2] & 0x3f) << 12) | ((eptr[3] & 0x3f) << 6) | \
+          (eptr[4] & 0x3f); \
+    else \
+      c = ((c & 0x01) << 30) | ((eptr[1] & 0x3f) << 24) | \
+          ((eptr[2] & 0x3f) << 18) | ((eptr[3] & 0x3f) << 12) | \
+          ((eptr[4] & 0x3f) << 6) | (eptr[5] & 0x3f); \
+    }
+
 /* Get the next UTF-8 character, not advancing the pointer. This is called when
 we know we are in UTF-8 mode. */

 #define GETCHAR(c, eptr) \
  c = *eptr; \
-  if (c >= 0xc0) \
-    { \
-    int gcii; \
-    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
-    int gcss = 6*gcaa; \
-    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
-    for (gcii = 1; gcii <= gcaa; gcii++) \
-      { \
-      gcss -= 6; \
-      c |= (eptr[gcii] & 0x3f) << gcss; \
-      } \
-    }
+  if (c >= 0xc0) GETUTF8(c, eptr);

 /* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
 pointer. */

 #define GETCHARTEST(c, eptr) \
  c = *eptr; \
-  if (utf8 && c >= 0xc0) \
+  if (utf8 && c >= 0xc0) GETUTF8(c, eptr);
+
+/* Base macro to pick up the remaining bytes of a UTF-8 character, advancing
+the pointer. */
+
+#define GETUTF8INC(c, eptr) \
    { \
-    int gcii; \
-    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
-    int gcss = 6*gcaa; \
-    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
-    for (gcii = 1; gcii <= gcaa; gcii++) \
+    if ((c & 0x20) == 0) \
+      c = ((c & 0x1f) << 6) | (*eptr++ & 0x3f); \
+    else if ((c & 0x10) == 0) \
      { \
-      gcss -= 6; \
-      c |= (eptr[gcii] & 0x3f) << gcss; \
+      c = ((c & 0x0f) << 12) | ((*eptr & 0x3f) << 6) | (eptr[1] & 0x3f); \
+      eptr += 2; \
+      } \
+    else if ((c & 0x08) == 0) \
+      { \
+      c = ((c & 0x07) << 18) | ((*eptr & 0x3f) << 12) | \
+          ((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \
+      eptr += 3; \
+      } \
+    else if ((c & 0x04) == 0) \
+      { \
+      c = ((c & 0x03) << 24) | ((*eptr & 0x3f) << 18) | \
+          ((eptr[1] & 0x3f) << 12) | ((eptr[2] & 0x3f) << 6) | \
+          (eptr[3] & 0x3f); \
+      eptr += 4; \
+      } \
+    else \
+      { \
+      c = ((c & 0x01) << 30) | ((*eptr & 0x3f) << 24) | \
+          ((eptr[1] & 0x3f) << 18) | ((eptr[2] & 0x3f) << 12) | \
+          ((eptr[3] & 0x3f) << 6) | (eptr[4] & 0x3f); \
+      eptr += 5; \
      } \
    }

@@ -463,31 +504,49 @@ know we are in UTF-8 mode. */

 #define GETCHARINC(c, eptr) \
  c = *eptr++; \
-  if (c >= 0xc0) \
-    { \
-    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
-    int gcss = 6*gcaa; \
-    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
-    while (gcaa-- > 0) \
-      { \
-      gcss -= 6; \
-      c |= (*eptr++ & 0x3f) << gcss; \
-      } \
-    }
+  if (c >= 0xc0) GETUTF8INC(c, eptr);

-/* Get the next character, testing for UTF-8 mode, and advancing the pointer */
+/* Get the next character, testing for UTF-8 mode, and advancing the pointer.
+This is called when we don't know if we are in UTF-8 mode. */

 #define GETCHARINCTEST(c, eptr) \
  c = *eptr++; \
-  if (utf8 && c >= 0xc0) \
+  if (utf8 && c >= 0xc0) GETUTF8INC(c, eptr);
+
+/* Base macro to pick up the remaining bytes of a UTF-8 character, not
+advancing the pointer, incrementing the length. */
+
+#define GETUTF8LEN(c, eptr, len) \
    { \
-    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
-    int gcss = 6*gcaa; \
-    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
-    while (gcaa-- > 0) \
+    if ((c & 0x20) == 0) \
      { \
-      gcss -= 6; \
-      c |= (*eptr++ & 0x3f) << gcss; \
+      c = ((c & 0x1f) << 6) | (eptr[1] & 0x3f); \
+      len++; \
+      } \
+    else if ((c & 0x10)  == 0) \
+      { \
+      c = ((c & 0x0f) << 12) | ((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \
+      len += 2; \
+      } \
+    else if ((c & 0x08)  == 0) \
+      {\
+      c = ((c & 0x07) << 18) | ((eptr[1] & 0x3f) << 12) | \
+          ((eptr[2] & 0x3f) << 6) | (eptr[3] & 0x3f); \
+      len += 3; \
+      } \
+    else if ((c & 0x04)  == 0) \
+      { \
+      c = ((c & 0x03) << 24) | ((eptr[1] & 0x3f) << 18) | \
+          ((eptr[2] & 0x3f) << 12) | ((eptr[3] & 0x3f) << 6) | \
+          (eptr[4] & 0x3f); \
+      len += 4; \
+      } \
+    else \
+      {\
+      c = ((c & 0x01) << 30) | ((eptr[1] & 0x3f) << 24) | \
+          ((eptr[2] & 0x3f) << 18) | ((eptr[3] & 0x3f) << 12) | \
+          ((eptr[4] & 0x3f) << 6) | (eptr[5] & 0x3f); \
+      len += 5; \
      } \
    }

@@ -496,39 +555,15 @@ if there are extra bytes. This is called when we know we are in UTF-8 mode. */

 #define GETCHARLEN(c, eptr, len) \
  c = *eptr; \
-  if (c >= 0xc0) \
-    { \
-    int gcii; \
-    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
-    int gcss = 6*gcaa; \
-    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
-    for (gcii = 1; gcii <= gcaa; gcii++) \
-      { \
-      gcss -= 6; \
-      c |= (eptr[gcii] & 0x3f) << gcss; \
-      } \
-    len += gcaa; \
-    }
+  if (c >= 0xc0) GETUTF8LEN(c, eptr, len);

 /* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
 pointer, incrementing length if there are extra bytes. This is called when we
-know we are in UTF-8 mode. */
+do not know if we are in UTF-8 mode. */

 #define GETCHARLENTEST(c, eptr, len) \
  c = *eptr; \
-  if (utf8 && c >= 0xc0) \
-    { \
-    int gcii; \
-    int gcaa = _pcre_utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
-    int gcss = 6*gcaa; \
-    c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
-    for (gcii = 1; gcii <= gcaa; gcii++) \
-      { \
-      gcss -= 6; \
-      c |= (eptr[gcii] & 0x3f) << gcss; \
-      } \
-    len += gcaa; \
-    }
+  if (utf8 && c >= 0xc0) GETUTF8LEN(c, eptr, len);

 /* If the pointer is not at the start of a character, move it back until
 it is. This is called only in UTF-8 mode - we don't put a test within the macro
@@ -536,7 +571,7 @@ because almost all calls are already within a block of UTF-8 only code. */

 #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--

-#endif
+#endif  /* SUPPORT_UTF8 */


 /* In case there is no definition of offsetof() provided - though any proper
@@ -580,7 +615,7 @@ time, run time, or study time, respectively. */
   PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
   PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \
   PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \
-   PCRE_JAVASCRIPT_COMPAT)
+   PCRE_JAVASCRIPT_COMPAT|PCRE_UCP|PCRE_NO_START_OPTIMIZE)

 #define PUBLIC_EXEC_OPTIONS \
  (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \
@@ -620,7 +655,7 @@ variable-length repeat, or a anything other than literal characters. */
 environments where these macros are defined elsewhere. Unfortunately, there
 is no way to do the same for the typedef. */

-typedef gboolean  BOOL;
+typedef gboolean BOOL;

 /* If PCRE is to support UTF-8 on EBCDIC platforms, we cannot use normal
 character constants like '*' because the compiler would emit their EBCDIC code,
@@ -870,6 +905,7 @@ so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */
 #define STRING_COMMIT0              "COMMIT\0"
 #define STRING_F0                   "F\0"
 #define STRING_FAIL0                "FAIL\0"
+#define STRING_MARK0                "MARK\0"
 #define STRING_PRUNE0               "PRUNE\0"
 #define STRING_SKIP0                "SKIP\0"
 #define STRING_THEN                 "THEN"
@@ -891,14 +927,16 @@ so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */

 #define STRING_DEFINE               "DEFINE"

-#define STRING_CR_RIGHTPAR          "CR)"
-#define STRING_LF_RIGHTPAR          "LF)"
-#define STRING_CRLF_RIGHTPAR        "CRLF)"
-#define STRING_ANY_RIGHTPAR         "ANY)"
-#define STRING_ANYCRLF_RIGHTPAR     "ANYCRLF)"
-#define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)"
-#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)"
-#define STRING_UTF8_RIGHTPAR        "UTF8)"
+#define STRING_CR_RIGHTPAR             "CR)"
+#define STRING_LF_RIGHTPAR             "LF)"
+#define STRING_CRLF_RIGHTPAR           "CRLF)"
+#define STRING_ANY_RIGHTPAR            "ANY)"
+#define STRING_ANYCRLF_RIGHTPAR        "ANYCRLF)"
+#define STRING_BSR_ANYCRLF_RIGHTPAR    "BSR_ANYCRLF)"
+#define STRING_BSR_UNICODE_RIGHTPAR    "BSR_UNICODE)"
+#define STRING_UTF8_RIGHTPAR           "UTF8)"
+#define STRING_UCP_RIGHTPAR            "UCP)"
+#define STRING_NO_START_OPT_RIGHTPAR   "NO_START_OPT)"

 #else  /* SUPPORT_UTF8 */

@@ -1122,6 +1160,7 @@ only. */
 #define STRING_COMMIT0              STR_C STR_O STR_M STR_M STR_I STR_T "\0"
 #define STRING_F0                   STR_F "\0"
 #define STRING_FAIL0                STR_F STR_A STR_I STR_L "\0"
+#define STRING_MARK0                STR_M STR_A STR_R STR_K "\0"
 #define STRING_PRUNE0               STR_P STR_R STR_U STR_N STR_E "\0"
 #define STRING_SKIP0                STR_S STR_K STR_I STR_P "\0"
 #define STRING_THEN                 STR_T STR_H STR_E STR_N
@@ -1143,14 +1182,16 @@ only. */

 #define STRING_DEFINE               STR_D STR_E STR_F STR_I STR_N STR_E

-#define STRING_CR_RIGHTPAR          STR_C STR_R STR_RIGHT_PARENTHESIS
-#define STRING_LF_RIGHTPAR          STR_L STR_F STR_RIGHT_PARENTHESIS
-#define STRING_CRLF_RIGHTPAR        STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
-#define STRING_ANY_RIGHTPAR         STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS
-#define STRING_ANYCRLF_RIGHTPAR     STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
-#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
-#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS
-#define STRING_UTF8_RIGHTPAR        STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS
+#define STRING_CR_RIGHTPAR             STR_C STR_R STR_RIGHT_PARENTHESIS
+#define STRING_LF_RIGHTPAR             STR_L STR_F STR_RIGHT_PARENTHESIS
+#define STRING_CRLF_RIGHTPAR           STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
+#define STRING_ANY_RIGHTPAR            STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS
+#define STRING_ANYCRLF_RIGHTPAR        STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
+#define STRING_BSR_ANYCRLF_RIGHTPAR    STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
+#define STRING_BSR_UNICODE_RIGHTPAR    STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS
+#define STRING_UTF8_RIGHTPAR           STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS
+#define STRING_UCP_RIGHTPAR            STR_U STR_C STR_P STR_RIGHT_PARENTHESIS
+#define STRING_NO_START_OPT_RIGHTPAR   STR_N STR_O STR_UNDERSCORE STR_S STR_T STR_A STR_R STR_T STR_UNDERSCORE STR_O STR_P STR_T STR_RIGHT_PARENTHESIS

 #endif  /* SUPPORT_UTF8 */

@@ -1183,9 +1224,13 @@ only. */

 #define PT_ANY        0    /* Any property - matches all chars */
 #define PT_LAMP       1    /* L& - the union of Lu, Ll, Lt */
-#define PT_GC         2    /* General characteristic (e.g. L) */
-#define PT_PC         3    /* Particular characteristic (e.g. Lu) */
+#define PT_GC         2    /* Specified general characteristic (e.g. L) */
+#define PT_PC         3    /* Specified particular characteristic (e.g. Lu) */
 #define PT_SC         4    /* Script (e.g. Han) */
+#define PT_ALNUM      5    /* Alphanumeric - the union of L and N */
+#define PT_SPACE      6    /* Perl space - Z plus 9,10,12,13 */
+#define PT_PXSPACE    7    /* POSIX space - Z plus 9,10,11,12,13 */
+#define PT_WORD       8    /* Word - L plus N plus underscore */

 /* Flag bits and data types for the extended class (OP_XCLASS) for classes that
 contain UTF-8 characters with values greater than 255. */
@@ -1202,9 +1247,15 @@ contain UTF-8 characters with values greater than 255. */
 /* These are escaped items that aren't just an encoding of a particular data
 value such as \n. They must have non-zero values, as check_escape() returns
 their negation. Also, they must appear in the same order as in the opcode
-definitions below, up to ESC_z. There's a dummy for OP_ANY because it
-corresponds to "." rather than an escape sequence, and another for OP_ALLANY
-(which is used for [^] in JavaScript compatibility mode).
+definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it
+corresponds to "." in DOTALL mode rather than an escape sequence. It is also
+used for [^] in JavaScript compatibility mode. In non-DOTALL mode, "." behaves
+like \N.
+
+The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc.
+when PCRE_UCP is set, when replacement of \d etc by \p sequences is required.
+They must be contiguous, and remain in order so that the replacements can be
+looked up from a table.

 The final escape must be ESC_REF as subsequent values are used for
 backreferences (\1, \2, \3, etc). There are two tests in the code for an escape
@@ -1214,11 +1265,12 @@ put in between that don't consume a character, that code will have to change.
 */

 enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s,
-       ESC_W, ESC_w, ESC_dum1, ESC_dum2, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H,
-       ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_g, ESC_k,
+       ESC_W, ESC_w, ESC_N, ESC_dum, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H,
+       ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z,
+       ESC_E, ESC_Q, ESC_g, ESC_k,
+       ESC_DU, ESC_du, ESC_SU, ESC_su, ESC_WU, ESC_wu,
       ESC_REF };

-
 /* Opcode table: Starting from 1 (i.e. after OP_END), the values up to
 OP_EOD must correspond in order to the list of escapes immediately above.

@@ -1242,8 +1294,8 @@ enum {
  OP_WHITESPACE,         /*  9 \s */
  OP_NOT_WORDCHAR,       /* 10 \W */
  OP_WORDCHAR,           /* 11 \w */
-  OP_ANY,            /* 12 Match any character (subject to DOTALL) */
-  OP_ALLANY,         /* 13 Match any character (not subject to DOTALL) */
+  OP_ANY,            /* 12 Match any character except newline */
+  OP_ALLANY,         /* 13 Match any character */
  OP_ANYBYTE,        /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */
  OP_NOTPROP,        /* 15 \P (not Unicode property) */
  OP_PROP,           /* 16 \p (Unicode property) */
@@ -1373,20 +1425,24 @@ enum {

  /* These are backtracking control verbs */

-  OP_PRUNE,          /* 107 */
-  OP_SKIP,           /* 108 */
-  OP_THEN,           /* 109 */
-  OP_COMMIT,         /* 110 */
+  OP_MARK,           /* 107 always has an argument */
+  OP_PRUNE,          /* 108 */
+  OP_PRUNE_ARG,      /* 109 same, but with argument */
+  OP_SKIP,           /* 110 */
+  OP_SKIP_ARG,       /* 111 same, but with argument */
+  OP_THEN,           /* 112 */
+  OP_THEN_ARG,       /* 113 same, but with argument */
+  OP_COMMIT,         /* 114 */

  /* These are forced failure and success verbs */

-  OP_FAIL,           /* 111 */
-  OP_ACCEPT,         /* 112 */
-  OP_CLOSE,          /* 113 Used before OP_ACCEPT to close open captures */
+  OP_FAIL,           /* 115 */
+  OP_ACCEPT,         /* 116 */
+  OP_CLOSE,          /* 117 Used before OP_ACCEPT to close open captures */

  /* This is used to skip a subpattern with a {0} quantifier */

-  OP_SKIPZERO,       /* 114 */
+  OP_SKIPZERO,       /* 118 */

  /* This is not an opcode, but is used to check that tables indexed by opcode
  are the correct length, in order to catch updating errors - there have been
@@ -1397,7 +1453,7 @@ enum {

 /* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro
 definitions that follow must also be updated to match. There are also tables
-called "coptable" cna "poptable" in pcre_dfa_exec.c that must be updated. */
+called "coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */


 /* This macro defines textual names for all the opcodes. These are used only
@@ -1422,7 +1478,8 @@ for debugging. The macro is referenced only in pcre_printint.c. */
  "Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond",        \
  "Cond ref", "Cond nref", "Cond rec", "Cond nrec", "Cond def",   \
  "Brazero", "Braminzero",                                        \
-  "*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT",      \
+  "*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP",                  \
+  "*THEN", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT",                \
  "Close", "Skip zero"


@@ -1488,8 +1545,10 @@ in UTF-8 mode. The code that uses this table must know about such things. */
  3, 3,                          /* RREF, NRREF                            */ \
  1,                             /* DEF                                    */ \
  1, 1,                          /* BRAZERO, BRAMINZERO                    */ \
-  1, 1, 1, 1,                    /* PRUNE, SKIP, THEN, COMMIT,             */ \
-  1, 1, 3, 1                     /* FAIL, ACCEPT, CLOSE, SKIPZERO          */
+  3, 1, 3,                       /* MARK, PRUNE, PRUNE_ARG                 */ \
+  1, 3,                          /* SKIP, SKIP_ARG                         */ \
+  1+LINK_SIZE, 3+LINK_SIZE,      /* THEN, THEN_ARG                         */ \
+  1, 1, 1, 3, 1                  /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO  */


 /* A magic value for OP_RREF and OP_NRREF to indicate the "any recursion"
@@ -1507,7 +1566,8 @@ enum { ERR0,  ERR1,  ERR2,  ERR3,  ERR4,  ERR5,  ERR6,  ERR7,  ERR8,  ERR9,
       ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
       ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
       ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
-       ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERRCOUNT };
+       ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68,
+       ERRCOUNT };

 /* The real format of the start of the pcre block; the index of names and the
 code vector run on as long as necessary after the end. We store an explicit
@@ -1650,6 +1710,7 @@ typedef struct match_data {
  BOOL   noteol;                /* NOTEOL flag */
  BOOL   utf8;                  /* UTF8 flag */
  BOOL   jscript_compat;        /* JAVASCRIPT_COMPAT flag */
+  BOOL   use_ucp;               /* PCRE_UCP flag */
  BOOL   endonly;               /* Dollar not before final \n */
  BOOL   notempty;              /* Empty string match not wanted */
  BOOL   notempty_atstart;      /* Empty string match at start not wanted */
@@ -1669,6 +1730,7 @@ typedef struct match_data {
  int    eptrn;                 /* Next free eptrblock */
  recursion_info *recursive;    /* Linked list of recursion data */
  void  *callout_data;          /* To pass back to callouts */
+  const uschar *mark;           /* Mark pointer to pass back */
 } match_data;

 /* A similar structure is used for the same purpose by the DFA matching
@@ -1764,7 +1826,7 @@ extern BOOL          _pcre_is_newline(USPTR, int, USPTR, int *, BOOL);
 extern int           _pcre_ord2utf8(int, uschar *);
 extern real_pcre    *_pcre_try_flipped(const real_pcre *, real_pcre *,
                       const pcre_study_data *, pcre_study_data *);
-#define              _pcre_valid_utf8(u, i) TRUE
+#define              _pcre_valid_utf8(USPTR, int) TRUE
 extern BOOL          _pcre_was_newline(USPTR, int, USPTR, int *, BOOL);
 extern BOOL          _pcre_xclass(int, const uschar *);

--- a/glib/pcre/pcre_study.c
+++ b/glib/pcre/pcre_study.c
@@ -48,6 +48,7 @@ supporting functions. */

 #include "pcre_internal.h"

+#define SET_BIT(c) start_bits[(c)/8] |= (1 << ((c)&7))  /* needs start_bits in scope */

 /* Returns from set_start_bits() */

@@ -413,6 +414,18 @@ for (;;)
 #endif
    break;

+    /* Skip these, but we need to add in the name length. */
+
+    case OP_MARK:
+    case OP_PRUNE_ARG:
+    case OP_SKIP_ARG:
+    cc += _pcre_OP_lengths[op] + cc[1];
+    break;
+
+    case OP_THEN_ARG:
+    cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
+    break;
+
    /* For the record, these are the opcodes that are matched by "default":
    OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
    OP_THEN. */
@@ -431,25 +444,121 @@ for (;;)
 *      Set a bit and maybe its alternate case    *
 *************************************************/

-/* Given a character, set its bit in the table, and also the bit for the other
-version of a letter if we are caseless.
+/* Given a character, set its first byte's bit in the table, and also the
+corresponding bit for the other version of a letter if we are caseless. In
+UTF-8 mode, for characters greater than 127, we can only do the caseless thing
+when Unicode property support is available.

 Arguments:
  start_bits    points to the bit map
-  c             is the character
+  p             points to the character
  caseless      the caseless flag
  cd            the block with char table pointers
+  utf8          TRUE for UTF-8 mode

-Returns:        nothing
+Returns:        pointer after the character
+*/
+
+static const uschar *
+set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
+  compile_data *cd, BOOL utf8)
+{
+unsigned int c = *p;   /* first byte of the character */
+
+SET_BIT(c);
+
+#ifdef SUPPORT_UTF8
+if (utf8 && c > 127)
+  {
+  GETCHARINC(c, p);   /* pick up the whole character, advancing p past it */
+#ifdef SUPPORT_UCP
+  if (caseless)
+    {
+    uschar buff[8];
+    c = UCD_OTHERCASE(c);
+    (void)_pcre_ord2utf8(c, buff);
+    SET_BIT(buff[0]);   /* first byte of the other-case character */
+    }
+#endif  /* SUPPORT_UCP */
+  return p;
+  }
+#endif  /* SUPPORT_UTF8 */
+
+/* Not UTF-8 mode, or the character is less than 128. */
+
+if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
+return p + 1;
+}
+
+
+
+/*************************************************
+*     Set bits for a positive character type     *
+*************************************************/
+
+/* This function sets starting bits for a character type. In UTF-8 mode, we can
+only do a direct setting for bytes less than 128, as otherwise there can be
+confusion with bytes in the middle of UTF-8 characters. In a "traditional"
+environment, the tables will only recognize ASCII characters anyway, but in at
+least one Windows environment, bits for some higher bytes were set in the tables.
+So we deal with that case by considering the UTF-8 encoding.
+
+Arguments:
+  start_bits     the starting bitmap
+  cbit_type      the type of character wanted
+  table_limit    32 for non-UTF-8; 16 for UTF-8
+  cd             the block with char table pointers
+
+Returns:         nothing
 */

 static void
-set_table_bit(uschar *start_bits, unsigned int c, BOOL caseless,
+set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
   compile_data *cd)
 {
-start_bits[c/8] |= (1 << (c&7));
-if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
-  start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
+register int c;
+for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
+if (table_limit == 32) return;   /* whole 32-byte table already copied */
+for (c = 128; c < 256; c++)      /* table_limit != 32: high half not copied above */
+  {
+  if ((cd->cbits[cbit_type + c/8] & (1 << (c&7))) != 0)  /* test the requested table */
+    {
+    uschar buff[8];
+    (void)_pcre_ord2utf8(c, buff);
+    SET_BIT(buff[0]);            /* only the first byte of the encoding is marked */
+    }
+  }
+}
+
+
+/*************************************************
+*     Set bits for a negative character type     *
+*************************************************/
+
+/* This function sets starting bits for a negative character type such as \D.
+In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
+otherwise there can be confusion with bytes in the middle of UTF-8 characters.
+Unlike in the positive case, where we can set appropriate starting bits for
+specific high-valued UTF-8 characters, in this case we have to set the bits for
+all high-valued characters. The lowest is 0xc2, but we overkill by starting at
+0xc0 (192) for simplicity.
+
+Arguments:
+  start_bits     the starting bitmap
+  cbit_type      the type of character wanted
+  table_limit    32 for non-UTF-8; 16 for UTF-8
+  cd             the block with char table pointers
+
+Returns:         nothing
+*/
+
 static void
+set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
+  compile_data *cd)
+{
+register int c;
+for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];  /* complement of the table */
+if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;  /* all bytes 0xc0-0xff */
 }


@@ -484,6 +593,7 @@ set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
 {
 register int c;
 int yield = SSB_DONE;
+int table_limit = utf8? 16:32;

 #if 0
 /* ========================================================================= */
@@ -607,12 +717,7 @@ do
      case OP_QUERY:
      case OP_MINQUERY:
      case OP_POSQUERY:
-      set_table_bit(start_bits, tcode[1], caseless, cd);
-      tcode += 2;
-#ifdef SUPPORT_UTF8
-      if (utf8 && tcode[-1] >= 0xc0)
-        tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
-#endif
+      tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
      break;

      /* Single-char upto sets the bit and tries the next */
@@ -620,12 +725,7 @@ do
      case OP_UPTO:
      case OP_MINUPTO:
      case OP_POSUPTO:
-      set_table_bit(start_bits, tcode[3], caseless, cd);
-      tcode += 4;
-#ifdef SUPPORT_UTF8
-      if (utf8 && tcode[-1] >= 0xc0)
-        tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
-#endif
+      tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
      break;

      /* At least one single char sets the bit and stops */
@@ -638,59 +738,86 @@ do
      case OP_PLUS:
      case OP_MINPLUS:
      case OP_POSPLUS:
-      set_table_bit(start_bits, tcode[1], caseless, cd);
+      (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
      try_next = FALSE;
      break;

-      /* Single character type sets the bits and stops */
+      /* Special spacing and line-terminating items. These recognize specific
+      lists of characters. The difference between VSPACE and ANYNL is that the
+      latter can match the two-character CRLF sequence, but that is not
+      relevant for finding the first character, so their code here is
+      identical. */
+
+      case OP_HSPACE:
+      SET_BIT(0x09);
+      SET_BIT(0x20);
+      if (utf8)
+        {
+        SET_BIT(0xC2);  /* For U+00A0 */
+        SET_BIT(0xE1);  /* For U+1680, U+180E */
+        SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
+        SET_BIT(0xE3);  /* For U+3000 */
+        }
+      else SET_BIT(0xA0);
+      try_next = FALSE;
+      break;
+
+      case OP_ANYNL:
+      case OP_VSPACE:
+      SET_BIT(0x0A);
+      SET_BIT(0x0B);
+      SET_BIT(0x0C);
+      SET_BIT(0x0D);
+      if (utf8)
+        {
+        SET_BIT(0xC2);  /* For U+0085 */
+        SET_BIT(0xE2);  /* For U+2028, U+2029 */
+        }
+      else SET_BIT(0x85);
+      try_next = FALSE;
+      break;
+
+      /* Single character types set the bits and stop. Note that if PCRE_UCP
+      is set, we do not see these op codes because \d etc are converted to
+      properties. Therefore, these apply in the case when only characters less
+      than 256 are recognized to match the types. */

      case OP_NOT_DIGIT:
-      for (c = 0; c < 32; c++)
-        start_bits[c] |= ~cd->cbits[c+cbit_digit];
+      set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
      try_next = FALSE;
      break;

      case OP_DIGIT:
-      for (c = 0; c < 32; c++)
-        start_bits[c] |= cd->cbits[c+cbit_digit];
+      set_type_bits(start_bits, cbit_digit, table_limit, cd);
      try_next = FALSE;
      break;

      /* The cbit_space table has vertical tab as whitespace; we have to
-      discard it. */
+      ensure it is set as not whitespace. */

      case OP_NOT_WHITESPACE:
-      for (c = 0; c < 32; c++)
-        {
-        int d = cd->cbits[c+cbit_space];
-        if (c == 1) d &= ~0x08;
-        start_bits[c] |= ~d;
-        }
+      set_nottype_bits(start_bits, cbit_space, table_limit, cd);
+      start_bits[1] |= 0x08;
      try_next = FALSE;
      break;

      /* The cbit_space table has vertical tab as whitespace; we have to
-      discard it. */
+      not set it from the table. */

      case OP_WHITESPACE:
-      for (c = 0; c < 32; c++)
-        {
-        int d = cd->cbits[c+cbit_space];
-        if (c == 1) d &= ~0x08;
-        start_bits[c] |= d;
-        }
+      c = start_bits[1];    /* Save in case it was already set */
+      set_type_bits(start_bits, cbit_space, table_limit, cd);
+      start_bits[1] = (start_bits[1] & ~0x08) | c;
      try_next = FALSE;
      break;

      case OP_NOT_WORDCHAR:
-      for (c = 0; c < 32; c++)
-        start_bits[c] |= ~cd->cbits[c+cbit_word];
+      set_nottype_bits(start_bits, cbit_word, table_limit, cd);
      try_next = FALSE;
      break;

      case OP_WORDCHAR:
-      for (c = 0; c < 32; c++)
-        start_bits[c] |= cd->cbits[c+cbit_word];
+      set_type_bits(start_bits, cbit_word, table_limit, cd);
      try_next = FALSE;
      break;

@@ -699,6 +826,7 @@ do

      case OP_TYPEPLUS:
      case OP_TYPEMINPLUS:
+      case OP_TYPEPOSPLUS:
      tcode++;
      break;

@@ -722,52 +850,69 @@ do
      case OP_TYPEPOSQUERY:
      switch(tcode[1])
        {
+        default:
        case OP_ANY:
        case OP_ALLANY:
        return SSB_FAIL;

+        case OP_HSPACE:
+        SET_BIT(0x09);
+        SET_BIT(0x20);
+        if (utf8)
+          {
+          SET_BIT(0xC2);  /* For U+00A0 */
+          SET_BIT(0xE1);  /* For U+1680, U+180E */
+          SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
+          SET_BIT(0xE3);  /* For U+3000 */
+          }
+        else SET_BIT(0xA0);
+        break;
+
+        case OP_ANYNL:
+        case OP_VSPACE:
+        SET_BIT(0x0A);
+        SET_BIT(0x0B);
+        SET_BIT(0x0C);
+        SET_BIT(0x0D);
+        if (utf8)
+          {
+          SET_BIT(0xC2);  /* For U+0085 */
+          SET_BIT(0xE2);  /* For U+2028, U+2029 */
+          }
+        else SET_BIT(0x85);
+        break;
+
        case OP_NOT_DIGIT:
-        for (c = 0; c < 32; c++)
-          start_bits[c] |= ~cd->cbits[c+cbit_digit];
+        set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
        break;

        case OP_DIGIT:
-        for (c = 0; c < 32; c++)
-          start_bits[c] |= cd->cbits[c+cbit_digit];
+        set_type_bits(start_bits, cbit_digit, table_limit, cd);
        break;

        /* The cbit_space table has vertical tab as whitespace; we have to
-        discard it. */
+        ensure it gets set as not whitespace. */

        case OP_NOT_WHITESPACE:
-        for (c = 0; c < 32; c++)
-          {
-          int d = cd->cbits[c+cbit_space];
-          if (c == 1) d &= ~0x08;
-          start_bits[c] |= ~d;
-          }
+        set_nottype_bits(start_bits, cbit_space, table_limit, cd);
+        start_bits[1] |= 0x08;
        break;

        /* The cbit_space table has vertical tab as whitespace; we have to
-        discard it. */
+        avoid setting it. */

        case OP_WHITESPACE:
-        for (c = 0; c < 32; c++)
-          {
-          int d = cd->cbits[c+cbit_space];
-          if (c == 1) d &= ~0x08;
-          start_bits[c] |= d;
-          }
+        c = start_bits[1];    /* Save in case it was already set */
+        set_type_bits(start_bits, cbit_space, table_limit, cd);
+        start_bits[1] = (start_bits[1] & ~0x08) | c;
        break;

        case OP_NOT_WORDCHAR:
-        for (c = 0; c < 32; c++)
-          start_bits[c] |= ~cd->cbits[c+cbit_word];
+        set_nottype_bits(start_bits, cbit_word, table_limit, cd);
        break;

        case OP_WORDCHAR:
-        for (c = 0; c < 32; c++)
-          start_bits[c] |= cd->cbits[c+cbit_word];
+        set_type_bits(start_bits, cbit_word, table_limit, cd);
        break;
        }

--- a/glib/pcre/pcre_tables.c
+++ b/glib/pcre/pcre_tables.c
@@ -123,8 +123,10 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
 #define STRING_Avestan0 STR_A STR_v STR_e STR_s STR_t STR_a STR_n "\0"
 #define STRING_Balinese0 STR_B STR_a STR_l STR_i STR_n STR_e STR_s STR_e "\0"
 #define STRING_Bamum0 STR_B STR_a STR_m STR_u STR_m "\0"
+#define STRING_Batak0 STR_B STR_a STR_t STR_a STR_k "\0"
 #define STRING_Bengali0 STR_B STR_e STR_n STR_g STR_a STR_l STR_i "\0"
 #define STRING_Bopomofo0 STR_B STR_o STR_p STR_o STR_m STR_o STR_f STR_o "\0"
+#define STRING_Brahmi0 STR_B STR_r STR_a STR_h STR_m STR_i "\0"
 #define STRING_Braille0 STR_B STR_r STR_a STR_i STR_l STR_l STR_e "\0"
 #define STRING_Buginese0 STR_B STR_u STR_g STR_i STR_n STR_e STR_s STR_e "\0"
 #define STRING_Buhid0 STR_B STR_u STR_h STR_i STR_d "\0"
@@ -184,6 +186,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
 #define STRING_Lu0 STR_L STR_u "\0"
 #define STRING_Lycian0 STR_L STR_y STR_c STR_i STR_a STR_n "\0"
 #define STRING_Lydian0 STR_L STR_y STR_d STR_i STR_a STR_n "\0"
+#define STRING_Mandaic0 STR_M STR_a STR_n STR_d STR_a STR_i STR_c "\0"
 #define STRING_M0 STR_M "\0"
 #define STRING_Malayalam0 STR_M STR_a STR_l STR_a STR_y STR_a STR_l STR_a STR_m "\0"
 #define STRING_Mc0 STR_M STR_c "\0"
@@ -243,6 +246,10 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
 #define STRING_Tifinagh0 STR_T STR_i STR_f STR_i STR_n STR_a STR_g STR_h "\0"
 #define STRING_Ugaritic0 STR_U STR_g STR_a STR_r STR_i STR_t STR_i STR_c "\0"
 #define STRING_Vai0 STR_V STR_a STR_i "\0"
+#define STRING_Xan0 STR_X STR_a STR_n "\0"
+#define STRING_Xps0 STR_X STR_p STR_s "\0"
+#define STRING_Xsp0 STR_X STR_s STR_p "\0"
+#define STRING_Xwd0 STR_X STR_w STR_d "\0"
 #define STRING_Yi0 STR_Y STR_i "\0"
 #define STRING_Z0 STR_Z "\0"
 #define STRING_Zl0 STR_Z STR_l "\0"
@@ -256,8 +263,10 @@ const char _pcre_utt_names[] =
  STRING_Avestan0
  STRING_Balinese0
  STRING_Bamum0
+  STRING_Batak0
  STRING_Bengali0
  STRING_Bopomofo0
+  STRING_Brahmi0
  STRING_Braille0
  STRING_Buginese0
  STRING_Buhid0
@@ -319,6 +328,7 @@ const char _pcre_utt_names[] =
  STRING_Lydian0
  STRING_M0
  STRING_Malayalam0
+  STRING_Mandaic0
  STRING_Mc0
  STRING_Me0
  STRING_Meetei_Mayek0
@@ -376,6 +386,10 @@ const char _pcre_utt_names[] =
  STRING_Tifinagh0
  STRING_Ugaritic0
  STRING_Vai0
+  STRING_Xan0
+  STRING_Xps0
+  STRING_Xsp0
+  STRING_Xwd0
  STRING_Yi0
  STRING_Z0
  STRING_Zl0
@@ -389,131 +403,138 @@ const ucp_type_table _pcre_utt[] = {
  {  20, PT_SC, ucp_Avestan },
  {  28, PT_SC, ucp_Balinese },
  {  37, PT_SC, ucp_Bamum },
-  {  43, PT_SC, ucp_Bengali },
-  {  51, PT_SC, ucp_Bopomofo },
-  {  60, PT_SC, ucp_Braille },
-  {  68, PT_SC, ucp_Buginese },
-  {  77, PT_SC, ucp_Buhid },
-  {  83, PT_GC, ucp_C },
-  {  85, PT_SC, ucp_Canadian_Aboriginal },
-  { 105, PT_SC, ucp_Carian },
-  { 112, PT_PC, ucp_Cc },
-  { 115, PT_PC, ucp_Cf },
-  { 118, PT_SC, ucp_Cham },
-  { 123, PT_SC, ucp_Cherokee },
-  { 132, PT_PC, ucp_Cn },
-  { 135, PT_PC, ucp_Co },
-  { 138, PT_SC, ucp_Common },
-  { 145, PT_SC, ucp_Coptic },
-  { 152, PT_PC, ucp_Cs },
-  { 155, PT_SC, ucp_Cuneiform },
-  { 165, PT_SC, ucp_Cypriot },
-  { 173, PT_SC, ucp_Cyrillic },
-  { 182, PT_SC, ucp_Deseret },
-  { 190, PT_SC, ucp_Devanagari },
-  { 201, PT_SC, ucp_Egyptian_Hieroglyphs },
-  { 222, PT_SC, ucp_Ethiopic },
-  { 231, PT_SC, ucp_Georgian },
-  { 240, PT_SC, ucp_Glagolitic },
-  { 251, PT_SC, ucp_Gothic },
-  { 258, PT_SC, ucp_Greek },
-  { 264, PT_SC, ucp_Gujarati },
-  { 273, PT_SC, ucp_Gurmukhi },
-  { 282, PT_SC, ucp_Han },
-  { 286, PT_SC, ucp_Hangul },
-  { 293, PT_SC, ucp_Hanunoo },
-  { 301, PT_SC, ucp_Hebrew },
-  { 308, PT_SC, ucp_Hiragana },
-  { 317, PT_SC, ucp_Imperial_Aramaic },
-  { 334, PT_SC, ucp_Inherited },
-  { 344, PT_SC, ucp_Inscriptional_Pahlavi },
-  { 366, PT_SC, ucp_Inscriptional_Parthian },
-  { 389, PT_SC, ucp_Javanese },
-  { 398, PT_SC, ucp_Kaithi },
-  { 405, PT_SC, ucp_Kannada },
-  { 413, PT_SC, ucp_Katakana },
-  { 422, PT_SC, ucp_Kayah_Li },
-  { 431, PT_SC, ucp_Kharoshthi },
-  { 442, PT_SC, ucp_Khmer },
-  { 448, PT_GC, ucp_L },
-  { 450, PT_LAMP, 0 },
-  { 453, PT_SC, ucp_Lao },
-  { 457, PT_SC, ucp_Latin },
-  { 463, PT_SC, ucp_Lepcha },
-  { 470, PT_SC, ucp_Limbu },
-  { 476, PT_SC, ucp_Linear_B },
-  { 485, PT_SC, ucp_Lisu },
-  { 490, PT_PC, ucp_Ll },
-  { 493, PT_PC, ucp_Lm },
-  { 496, PT_PC, ucp_Lo },
-  { 499, PT_PC, ucp_Lt },
-  { 502, PT_PC, ucp_Lu },
-  { 505, PT_SC, ucp_Lycian },
-  { 512, PT_SC, ucp_Lydian },
-  { 519, PT_GC, ucp_M },
-  { 521, PT_SC, ucp_Malayalam },
-  { 531, PT_PC, ucp_Mc },
-  { 534, PT_PC, ucp_Me },
-  { 537, PT_SC, ucp_Meetei_Mayek },
-  { 550, PT_PC, ucp_Mn },
-  { 553, PT_SC, ucp_Mongolian },
-  { 563, PT_SC, ucp_Myanmar },
-  { 571, PT_GC, ucp_N },
-  { 573, PT_PC, ucp_Nd },
-  { 576, PT_SC, ucp_New_Tai_Lue },
-  { 588, PT_SC, ucp_Nko },
-  { 592, PT_PC, ucp_Nl },
-  { 595, PT_PC, ucp_No },
-  { 598, PT_SC, ucp_Ogham },
-  { 604, PT_SC, ucp_Ol_Chiki },
-  { 613, PT_SC, ucp_Old_Italic },
-  { 624, PT_SC, ucp_Old_Persian },
-  { 636, PT_SC, ucp_Old_South_Arabian },
-  { 654, PT_SC, ucp_Old_Turkic },
-  { 665, PT_SC, ucp_Oriya },
-  { 671, PT_SC, ucp_Osmanya },
-  { 679, PT_GC, ucp_P },
-  { 681, PT_PC, ucp_Pc },
-  { 684, PT_PC, ucp_Pd },
-  { 687, PT_PC, ucp_Pe },
-  { 690, PT_PC, ucp_Pf },
-  { 693, PT_SC, ucp_Phags_Pa },
-  { 702, PT_SC, ucp_Phoenician },
-  { 713, PT_PC, ucp_Pi },
-  { 716, PT_PC, ucp_Po },
-  { 719, PT_PC, ucp_Ps },
-  { 722, PT_SC, ucp_Rejang },
-  { 729, PT_SC, ucp_Runic },
-  { 735, PT_GC, ucp_S },
-  { 737, PT_SC, ucp_Samaritan },
-  { 747, PT_SC, ucp_Saurashtra },
-  { 758, PT_PC, ucp_Sc },
-  { 761, PT_SC, ucp_Shavian },
-  { 769, PT_SC, ucp_Sinhala },
-  { 777, PT_PC, ucp_Sk },
-  { 780, PT_PC, ucp_Sm },
-  { 783, PT_PC, ucp_So },
-  { 786, PT_SC, ucp_Sundanese },
-  { 796, PT_SC, ucp_Syloti_Nagri },
-  { 809, PT_SC, ucp_Syriac },
-  { 816, PT_SC, ucp_Tagalog },
-  { 824, PT_SC, ucp_Tagbanwa },
-  { 833, PT_SC, ucp_Tai_Le },
-  { 840, PT_SC, ucp_Tai_Tham },
-  { 849, PT_SC, ucp_Tai_Viet },
-  { 858, PT_SC, ucp_Tamil },
-  { 864, PT_SC, ucp_Telugu },
-  { 871, PT_SC, ucp_Thaana },
-  { 878, PT_SC, ucp_Thai },
-  { 883, PT_SC, ucp_Tibetan },
-  { 891, PT_SC, ucp_Tifinagh },
-  { 900, PT_SC, ucp_Ugaritic },
-  { 909, PT_SC, ucp_Vai },
-  { 913, PT_SC, ucp_Yi },
-  { 916, PT_GC, ucp_Z },
-  { 918, PT_PC, ucp_Zl },
-  { 921, PT_PC, ucp_Zp },
-  { 924, PT_PC, ucp_Zs }
+  {  43, PT_SC, ucp_Batak },
+  {  49, PT_SC, ucp_Bengali },
+  {  57, PT_SC, ucp_Bopomofo },
+  {  66, PT_SC, ucp_Brahmi },
+  {  73, PT_SC, ucp_Braille },
+  {  81, PT_SC, ucp_Buginese },
+  {  90, PT_SC, ucp_Buhid },
+  {  96, PT_GC, ucp_C },
+  {  98, PT_SC, ucp_Canadian_Aboriginal },
+  { 118, PT_SC, ucp_Carian },
+  { 125, PT_PC, ucp_Cc },
+  { 128, PT_PC, ucp_Cf },
+  { 131, PT_SC, ucp_Cham },
+  { 136, PT_SC, ucp_Cherokee },
+  { 145, PT_PC, ucp_Cn },
+  { 148, PT_PC, ucp_Co },
+  { 151, PT_SC, ucp_Common },
+  { 158, PT_SC, ucp_Coptic },
+  { 165, PT_PC, ucp_Cs },
+  { 168, PT_SC, ucp_Cuneiform },
+  { 178, PT_SC, ucp_Cypriot },
+  { 186, PT_SC, ucp_Cyrillic },
+  { 195, PT_SC, ucp_Deseret },
+  { 203, PT_SC, ucp_Devanagari },
+  { 214, PT_SC, ucp_Egyptian_Hieroglyphs },
+  { 235, PT_SC, ucp_Ethiopic },
+  { 244, PT_SC, ucp_Georgian },
+  { 253, PT_SC, ucp_Glagolitic },
+  { 264, PT_SC, ucp_Gothic },
+  { 271, PT_SC, ucp_Greek },
+  { 277, PT_SC, ucp_Gujarati },
+  { 286, PT_SC, ucp_Gurmukhi },
+  { 295, PT_SC, ucp_Han },
+  { 299, PT_SC, ucp_Hangul },
+  { 306, PT_SC, ucp_Hanunoo },
+  { 314, PT_SC, ucp_Hebrew },
+  { 321, PT_SC, ucp_Hiragana },
+  { 330, PT_SC, ucp_Imperial_Aramaic },
+  { 347, PT_SC, ucp_Inherited },
+  { 357, PT_SC, ucp_Inscriptional_Pahlavi },
+  { 379, PT_SC, ucp_Inscriptional_Parthian },
+  { 402, PT_SC, ucp_Javanese },
+  { 411, PT_SC, ucp_Kaithi },
+  { 418, PT_SC, ucp_Kannada },
+  { 426, PT_SC, ucp_Katakana },
+  { 435, PT_SC, ucp_Kayah_Li },
+  { 444, PT_SC, ucp_Kharoshthi },
+  { 455, PT_SC, ucp_Khmer },
+  { 461, PT_GC, ucp_L },
+  { 463, PT_LAMP, 0 },
+  { 466, PT_SC, ucp_Lao },
+  { 470, PT_SC, ucp_Latin },
+  { 476, PT_SC, ucp_Lepcha },
+  { 483, PT_SC, ucp_Limbu },
+  { 489, PT_SC, ucp_Linear_B },
+  { 498, PT_SC, ucp_Lisu },
+  { 503, PT_PC, ucp_Ll },
+  { 506, PT_PC, ucp_Lm },
+  { 509, PT_PC, ucp_Lo },
+  { 512, PT_PC, ucp_Lt },
+  { 515, PT_PC, ucp_Lu },
+  { 518, PT_SC, ucp_Lycian },
+  { 525, PT_SC, ucp_Lydian },
+  { 532, PT_GC, ucp_M },
+  { 534, PT_SC, ucp_Malayalam },
+  { 544, PT_SC, ucp_Mandaic },
+  { 552, PT_PC, ucp_Mc },
+  { 555, PT_PC, ucp_Me },
+  { 558, PT_SC, ucp_Meetei_Mayek },
+  { 571, PT_PC, ucp_Mn },
+  { 574, PT_SC, ucp_Mongolian },
+  { 584, PT_SC, ucp_Myanmar },
+  { 592, PT_GC, ucp_N },
+  { 594, PT_PC, ucp_Nd },
+  { 597, PT_SC, ucp_New_Tai_Lue },
+  { 609, PT_SC, ucp_Nko },
+  { 613, PT_PC, ucp_Nl },
+  { 616, PT_PC, ucp_No },
+  { 619, PT_SC, ucp_Ogham },
+  { 625, PT_SC, ucp_Ol_Chiki },
+  { 634, PT_SC, ucp_Old_Italic },
+  { 645, PT_SC, ucp_Old_Persian },
+  { 657, PT_SC, ucp_Old_South_Arabian },
+  { 675, PT_SC, ucp_Old_Turkic },
+  { 686, PT_SC, ucp_Oriya },
+  { 692, PT_SC, ucp_Osmanya },
+  { 700, PT_GC, ucp_P },
+  { 702, PT_PC, ucp_Pc },
+  { 705, PT_PC, ucp_Pd },
+  { 708, PT_PC, ucp_Pe },
+  { 711, PT_PC, ucp_Pf },
+  { 714, PT_SC, ucp_Phags_Pa },
+  { 723, PT_SC, ucp_Phoenician },
+  { 734, PT_PC, ucp_Pi },
+  { 737, PT_PC, ucp_Po },
+  { 740, PT_PC, ucp_Ps },
+  { 743, PT_SC, ucp_Rejang },
+  { 750, PT_SC, ucp_Runic },
+  { 756, PT_GC, ucp_S },
+  { 758, PT_SC, ucp_Samaritan },
+  { 768, PT_SC, ucp_Saurashtra },
+  { 779, PT_PC, ucp_Sc },
+  { 782, PT_SC, ucp_Shavian },
+  { 790, PT_SC, ucp_Sinhala },
+  { 798, PT_PC, ucp_Sk },
+  { 801, PT_PC, ucp_Sm },
+  { 804, PT_PC, ucp_So },
+  { 807, PT_SC, ucp_Sundanese },
+  { 817, PT_SC, ucp_Syloti_Nagri },
+  { 830, PT_SC, ucp_Syriac },
+  { 837, PT_SC, ucp_Tagalog },
+  { 845, PT_SC, ucp_Tagbanwa },
+  { 854, PT_SC, ucp_Tai_Le },
+  { 861, PT_SC, ucp_Tai_Tham },
+  { 870, PT_SC, ucp_Tai_Viet },
+  { 879, PT_SC, ucp_Tamil },
+  { 885, PT_SC, ucp_Telugu },
+  { 892, PT_SC, ucp_Thaana },
+  { 899, PT_SC, ucp_Thai },
+  { 904, PT_SC, ucp_Tibetan },
+  { 912, PT_SC, ucp_Tifinagh },
+  { 921, PT_SC, ucp_Ugaritic },
+  { 930, PT_SC, ucp_Vai },
+  { 934, PT_ALNUM, 0 },
+  { 938, PT_PXSPACE, 0 },
+  { 942, PT_SPACE, 0 },
+  { 946, PT_WORD, 0 },
+  { 950, PT_SC, ucp_Yi },
+  { 953, PT_GC, ucp_Z },
+  { 955, PT_PC, ucp_Zl },
+  { 958, PT_PC, ucp_Zp },
+  { 961, PT_PC, ucp_Zs }
 };

 const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table);
--- a/glib/pcre/pcre_xclass.c
+++ b/glib/pcre/pcre_xclass.c
@@ -6,7 +6,7 @@
 and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
-           Copyright (c) 1997-2009 University of Cambridge
+           Copyright (c) 1997-2010 University of Cambridge

 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -104,6 +104,7 @@ while ((t = *data++) != XCL_END)
  else  /* XCL_PROP & XCL_NOTPROP */
    {
    int chartype = UCD_CHARTYPE(c);
+
    switch(*data)
      {
      case PT_ANY:
@@ -111,12 +112,13 @@ while ((t = *data++) != XCL_END)
      break;

      case PT_LAMP:
-      if ((chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt) ==
-          (t == XCL_PROP)) return !negated;
+      if ((chartype == ucp_Lu || chartype == ucp_Ll ||
+           chartype == ucp_Lt) == (t == XCL_PROP)) return !negated;
      break;

      case PT_GC:
-      if ((data[1] == _pcre_ucp_gentype[chartype]) == (t == XCL_PROP)) return !negated;
+      if ((data[1] == _pcre_ucp_gentype[chartype]) == (t == XCL_PROP))
+        return !negated;
      break;

      case PT_PC:
@@ -127,6 +129,33 @@ while ((t = *data++) != XCL_END)
      if ((data[1] == UCD_SCRIPT(c)) == (t == XCL_PROP)) return !negated;
      break;

+      case PT_ALNUM:
+      if ((_pcre_ucp_gentype[chartype] == ucp_L ||
+           _pcre_ucp_gentype[chartype] == ucp_N) == (t == XCL_PROP))
+        return !negated;
+      break;
+
+      case PT_SPACE:    /* Perl space */
+      if ((_pcre_ucp_gentype[chartype] == ucp_Z ||
+           c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
+             == (t == XCL_PROP))
+        return !negated;
+      break;
+
+      case PT_PXSPACE:  /* POSIX space */
+      if ((_pcre_ucp_gentype[chartype] == ucp_Z ||
+           c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
+           c == CHAR_FF || c == CHAR_CR) == (t == XCL_PROP))
+        return !negated;
+      break;
+
+      case PT_WORD:
+      if ((_pcre_ucp_gentype[chartype] == ucp_L ||
+           _pcre_ucp_gentype[chartype] == ucp_N || c == CHAR_UNDERSCORE)
+             == (t == XCL_PROP))
+        return !negated;
+      break;
+
      /* This should never occur, but compilers may mutter if there is no
      default. */

--- a/glib/pcre/ucp.h
+++ b/glib/pcre/ucp.h
@@ -150,7 +150,10 @@ enum {
  ucp_Old_Turkic = G_UNICODE_SCRIPT_OLD_TURKIC,
  ucp_Samaritan = G_UNICODE_SCRIPT_SAMARITAN,
  ucp_Tai_Tham = G_UNICODE_SCRIPT_TAI_THAM,
-  ucp_Tai_Viet = G_UNICODE_SCRIPT_TAI_VIET
+  ucp_Tai_Viet = G_UNICODE_SCRIPT_TAI_VIET,
+  ucp_Batak = G_UNICODE_SCRIPT_BATAK,
+  ucp_Brahmi = G_UNICODE_SCRIPT_BRAHMI,
+  ucp_Mandaic = G_UNICODE_SCRIPT_MANDAIC
 };

 #endif