svn path=/trunk/; revision=6938
This commit is contained in:
Matthias Clasen 2008-05-27 04:17:54 +00:00
parent c9db84f9f2
commit adae23350a
11 changed files with 501 additions and 297 deletions

View File

@ -1,3 +1,7 @@
2008-05-27 Matthias Clasen <mclasen@redhat.com>
* glib/pcre/*: Update to PCRE 7.7
2008-05-26 Matthias Clasen <mclasen@redhat.com> 2008-05-26 Matthias Clasen <mclasen@redhat.com>
* glib/gchecksum.c: Add Since: tag to g_checksum_reset * glib/gchecksum.c: Add Since: tag to g_checksum_reset

View File

@ -1,35 +1,30 @@
TOP = ..\..\.. TOP = ..\..\..
!INCLUDE ..\..\build\win32\make.msc !INCLUDE ..\..\build\win32\make.msc
INCLUDES = \\ INCLUDES = \
-I ..\.. \\ -I ..\.. \
-I .. -I ..
DEFINES = \\ DEFINES = \
-DPCRE_STATIC \\ -DPCRE_STATIC \
-DHAVE_CONFIG_H \\ -DHAVE_CONFIG_H \
-DHAVE_LONG_LONG_FORMAT \\ -DHAVE_LONG_LONG_FORMAT \
-DSUPPORT_UCP \\ -DSUPPORT_UCP \
-DSUPPORT_UTF8 \\ -DSUPPORT_UTF8 \
-DNEWLINE=-1 \\ -DNEWLINE=-1 \
-DMATCH_LIMIT=10000000 \\ -DMATCH_LIMIT=10000000 \
-DMATCH_LIMIT_RECURSION=10000000 \\ -DMATCH_LIMIT_RECURSION=10000000 \
-DMAX_NAME_SIZE=32 \\ -DMAX_NAME_SIZE=32 \
-DMAX_NAME_COUNT=10000 \\ -DMAX_NAME_COUNT=10000 \
-DMAX_DUPLENGTH=30000 \\ -DMAX_DUPLENGTH=30000 \
-DLINK_SIZE=2 \\ -DLINK_SIZE=2 \
-DEBCDIC=0 \\ -DEBCDIC=0 \
-DPOSIX_MALLOC_THRESHOLD=10 -DPOSIX_MALLOC_THRESHOLD=10
OBJECTS = \\ OBJECTS = \
`
for f in $all_files; do
echo " $f.obj \\\\"
done
`
all : pcre.lib all : pcre.lib
pcre.lib : \$(OBJECTS) pcre.lib : $(OBJECTS)
lib -out:pcre.lib \$(OBJECTS) lib -out:pcre.lib $(OBJECTS)

View File

@ -42,10 +42,9 @@ POSSIBILITY OF SUCH DAMAGE.
/* The current PCRE version information. */ /* The current PCRE version information. */
#define PCRE_MAJOR 7 #define PCRE_MAJOR 7
#define PCRE_MINOR 7
#define PCRE_MINOR 6
#define PCRE_PRERELEASE #define PCRE_PRERELEASE
#define PCRE_DATE 2008-01-28 #define PCRE_DATE 2008-05-07
/* When an application links to a PCRE DLL in Windows, the symbols that are /* When an application links to a PCRE DLL in Windows, the symbols that are
imported have to be identified as such. When building PCRE, the appropriate imported have to be identified as such. When building PCRE, the appropriate
@ -125,6 +124,7 @@ extern "C" {
#define PCRE_NEWLINE_ANYCRLF 0x00500000 #define PCRE_NEWLINE_ANYCRLF 0x00500000
#define PCRE_BSR_ANYCRLF 0x00800000 #define PCRE_BSR_ANYCRLF 0x00800000
#define PCRE_BSR_UNICODE 0x01000000 #define PCRE_BSR_UNICODE 0x01000000
#define PCRE_JAVASCRIPT_COMPAT 0x02000000
/* Exec-time and get/set-time error codes */ /* Exec-time and get/set-time error codes */

View File

@ -1,3 +1,6 @@
/* This file is autogenerated by ../update-pcre/update.sh during
* the update of the local copy of PCRE.
*/
/************************************************* /*************************************************
* Perl-Compatible Regular Expressions * * Perl-Compatible Regular Expressions *
*************************************************/ *************************************************/

View File

@ -158,7 +158,7 @@ static const char verbnames[] =
"SKIP\0" "SKIP\0"
"THEN"; "THEN";
static verbitem verbs[] = { static const verbitem verbs[] = {
{ 6, OP_ACCEPT }, { 6, OP_ACCEPT },
{ 6, OP_COMMIT }, { 6, OP_COMMIT },
{ 1, OP_FAIL }, { 1, OP_FAIL },
@ -168,7 +168,7 @@ static verbitem verbs[] = {
{ 4, OP_THEN } { 4, OP_THEN }
}; };
static int verbcount = sizeof(verbs)/sizeof(verbitem); static const int verbcount = sizeof(verbs)/sizeof(verbitem);
/* Tables of names of POSIX character classes and their lengths. The names are /* Tables of names of POSIX character classes and their lengths. The names are
@ -295,14 +295,15 @@ static const char error_texts[] =
/* 55 */ /* 55 */
"repeating a DEFINE group is not allowed\0" "repeating a DEFINE group is not allowed\0"
"inconsistent NEWLINE options\0" "inconsistent NEWLINE options\0"
"\\g is not followed by a braced name or an optionally braced non-zero number\0" "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
"(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0" "a numbered reference must not be zero\0"
"(*VERB) with an argument is not supported\0" "(*VERB) with an argument is not supported\0"
/* 60 */ /* 60 */
"(*VERB) not recognized\0" "(*VERB) not recognized\0"
"number is too big\0" "number is too big\0"
"subpattern name expected\0" "subpattern name expected\0"
"digit expected after (?+"; "digit expected after (?+\0"
"] is an invalid data character in JavaScript compatibility mode";
/* Definition to allow mutual recursion */ /* Definition to allow mutual recursion */
@ -378,9 +379,15 @@ if (c == 0) *errorcodeptr = ERR1;
in a table. A non-zero result is something that can be returned immediately. in a table. A non-zero result is something that can be returned immediately.
Otherwise further processing may be required. */ Otherwise further processing may be required. */
#ifndef EBCDIC /* ASCII coding */
else if (c < '0' || c > 'z') {} /* Not alphanumeric */ else if (c < '0' || c > 'z') {} /* Not alphanumeric */
else if ((i = escapes[c - '0']) != 0) c = i; else if ((i = escapes[c - '0']) != 0) c = i;
#else /* EBCDIC coding */
else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
else if ((i = escapes[c - 0x48]) != 0) c = i;
#endif
/* Escapes that need further processing, or are illegal. */ /* Escapes that need further processing, or are illegal. */
else else
@ -401,14 +408,31 @@ else
*errorcodeptr = ERR37; *errorcodeptr = ERR37;
break; break;
/* \g must be followed by a number, either plain or braced. If positive, it /* \g must be followed by one of a number of specific things:
is an absolute backreference. If negative, it is a relative backreference.
This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a (1) A number, either plain or braced. If positive, it is an absolute
reference to a named group. This is part of Perl's movement towards a backreference. If negative, it is a relative backreference. This is a Perl
unified syntax for back references. As this is synonymous with \k{name}, we 5.10 feature.
fudge it up by pretending it really was \k. */
(2) Perl 5.10 also supports \g{name} as a reference to a named group. This
is part of Perl's movement towards a unified syntax for back references. As
this is synonymous with \k{name}, we fudge it up by pretending it really
was \k.
(3) For Oniguruma compatibility we also support \g followed by a name or a
number either in angle brackets or in single quotes. However, these are
(possibly recursive) subroutine calls, _not_ backreferences. Just return
the -ESC_g code (cf \k). */
case 'g': case 'g':
if (ptr[1] == '<' || ptr[1] == '\'')
{
c = -ESC_g;
break;
}
/* Handle the Perl-compatible cases */
if (ptr[1] == '{') if (ptr[1] == '{')
{ {
const uschar *p; const uschar *p;
@ -435,18 +459,24 @@ else
while (g_ascii_isdigit(ptr[1]) != 0) while (g_ascii_isdigit(ptr[1]) != 0)
c = c * 10 + *(++ptr) - '0'; c = c * 10 + *(++ptr) - '0';
if (c < 0) if (c < 0) /* Integer overflow */
{ {
*errorcodeptr = ERR61; *errorcodeptr = ERR61;
break; break;
} }
if (c == 0 || (braced && *(++ptr) != '}')) if (braced && *(++ptr) != '}')
{ {
*errorcodeptr = ERR57; *errorcodeptr = ERR57;
break; break;
} }
if (c == 0)
{
*errorcodeptr = ERR58;
break;
}
if (negated) if (negated)
{ {
if (c > bracount) if (c > bracount)
@ -481,7 +511,7 @@ else
c -= '0'; c -= '0';
while (g_ascii_isdigit(ptr[1]) != 0) while (g_ascii_isdigit(ptr[1]) != 0)
c = c * 10 + *(++ptr) - '0'; c = c * 10 + *(++ptr) - '0';
if (c < 0) if (c < 0) /* Integer overflow */
{ {
*errorcodeptr = ERR61; *errorcodeptr = ERR61;
break; break;
@ -822,7 +852,7 @@ be terminated by '>' because that is checked in the first pass.
Arguments: Arguments:
ptr current position in the pattern ptr current position in the pattern
count current count of capturing parens so far encountered cd compile background data
name name to seek, or NULL if seeking a numbered subpattern name name to seek, or NULL if seeking a numbered subpattern
lorn name length, or subpattern number if name is NULL lorn name length, or subpattern number if name is NULL
xmode TRUE if we are in /x mode xmode TRUE if we are in /x mode
@ -831,10 +861,11 @@ Returns: the number of the named subpattern, or -1 if not found
*/ */
static int static int
find_parens(const uschar *ptr, int count, const uschar *name, int lorn, find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn,
BOOL xmode) BOOL xmode)
{ {
const uschar *thisname; const uschar *thisname;
int count = cd->bracount;
for (; *ptr != 0; ptr++) for (; *ptr != 0; ptr++)
{ {
@ -854,10 +885,34 @@ for (; *ptr != 0; ptr++)
continue; continue;
} }
/* Skip over character classes */ /* Skip over character classes; this logic must be similar to the way they
are handled for real. If the first character is '^', skip it. Also, if the
first few characters (either before or after ^) are \Q\E or \E we skip them
too. This makes for compatibility with Perl. */
if (*ptr == '[') if (*ptr == '[')
{ {
BOOL negate_class = FALSE;
for (;;)
{
int c = *(++ptr);
if (c == '\\')
{
if (ptr[1] == 'E') ptr++;
else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
else break;
}
else if (!negate_class && c == '^')
negate_class = TRUE;
else break;
}
/* If the next character is ']', it is a data character that must be
skipped, except in JavaScript compatibility mode. */
if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
ptr++;
while (*(++ptr) != ']') while (*(++ptr) != ']')
{ {
if (*ptr == 0) return -1; if (*ptr == 0) return -1;
@ -1122,6 +1177,7 @@ for (;;)
case OP_NOT_WORDCHAR: case OP_NOT_WORDCHAR:
case OP_WORDCHAR: case OP_WORDCHAR:
case OP_ANY: case OP_ANY:
case OP_ALLANY:
branchlength++; branchlength++;
cc++; cc++;
break; break;
@ -1414,7 +1470,7 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE
/* Groups with zero repeats can of course be empty; skip them. */ /* Groups with zero repeats can of course be empty; skip them. */
if (c == OP_BRAZERO || c == OP_BRAMINZERO) if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
{ {
code += _pcre_OP_lengths[c]; code += _pcre_OP_lengths[c];
do code += GET(code, 1); while (*code == OP_ALT); do code += GET(code, 1); while (*code == OP_ALT);
@ -1500,6 +1556,7 @@ for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE
case OP_NOT_WORDCHAR: case OP_NOT_WORDCHAR:
case OP_WORDCHAR: case OP_WORDCHAR:
case OP_ANY: case OP_ANY:
case OP_ALLANY:
case OP_ANYBYTE: case OP_ANYBYTE:
case OP_CHAR: case OP_CHAR:
case OP_CHARNC: case OP_CHARNC:
@ -1694,11 +1751,12 @@ return -1;
that is referenced. This means that groups can be replicated for fixed that is referenced. This means that groups can be replicated for fixed
repetition simply by copying (because the recursion is allowed to refer to repetition simply by copying (because the recursion is allowed to refer to
earlier groups that are outside the current group). However, when a group is earlier groups that are outside the current group). However, when a group is
optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
it, after it has been compiled. This means that any OP_RECURSE items within it inserted before it, after it has been compiled. This means that any OP_RECURSE
that refer to the group itself or any contained groups have to have their items within it that refer to the group itself or any contained groups have to
offsets adjusted. That is one of the jobs of this function. Before it is called, have their offsets adjusted. That is one of the jobs of this function. Before it
the partially compiled regex must be temporarily terminated with OP_END. is called, the partially compiled regex must be temporarily terminated with
OP_END.
This function has been extended with the possibility of forward references for This function has been extended with the possibility of forward references for
recursions and subroutine calls. It must also check the list of such references recursions and subroutine calls. It must also check the list of such references
@ -1983,7 +2041,6 @@ if (next >= 0) switch(op_code)
/* For OP_NOT, "item" must be a single-byte character. */ /* For OP_NOT, "item" must be a single-byte character. */
case OP_NOT: case OP_NOT:
if (next < 0) return FALSE; /* Not a character */
if (item == next) return TRUE; if (item == next) return TRUE;
if ((options & PCRE_CASELESS) == 0) return FALSE; if ((options & PCRE_CASELESS) == 0) return FALSE;
#ifdef SUPPORT_UTF8 #ifdef SUPPORT_UTF8
@ -2486,7 +2543,7 @@ for (;; ptr++)
zerofirstbyte = firstbyte; zerofirstbyte = firstbyte;
zeroreqbyte = reqbyte; zeroreqbyte = reqbyte;
previous = code; previous = code;
*code++ = OP_ANY; *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
break; break;
@ -2501,7 +2558,17 @@ for (;; ptr++)
opcode is compiled. It may optionally have a bit map for characters < 256, opcode is compiled. It may optionally have a bit map for characters < 256,
but those above are explicitly listed afterwards. A flag byte tells but those above are explicitly listed afterwards. A flag byte tells
whether the bitmap is present, and whether this is a negated class or not. whether the bitmap is present, and whether this is a negated class or not.
*/
In JavaScript compatibility mode, an isolated ']' causes an error. In
default (Perl) mode, it is treated as a data character. */
case ']':
if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
{
*errorcodeptr = ERR64;
goto FAILED;
}
goto NORMAL_CHAR;
case '[': case '[':
previous = code; previous = code;
@ -2535,6 +2602,19 @@ for (;; ptr++)
else break; else break;
} }
/* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
an initial ']' is taken as a data character -- the code below handles
that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
[^] must match any character, so generate OP_ALLANY. */
if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
{
*code++ = negate_class? OP_ALLANY : OP_FAIL;
if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
zerofirstbyte = firstbyte;
break;
}
/* If a class contains a negative special such as \S, we need to flip the /* If a class contains a negative special such as \S, we need to flip the
negation flag at the end, so that support for characters > 255 works negation flag at the end, so that support for characters > 255 works
correctly (they are all included in the class). */ correctly (they are all included in the class). */
@ -3690,28 +3770,38 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
if (repeat_min == 0) if (repeat_min == 0)
{ {
/* If the maximum is also zero, we just omit the group from the output /* If the maximum is also zero, we used to just omit the group from the
altogether. */ output altogether, like this:
if (repeat_max == 0) ** if (repeat_max == 0)
{ ** {
code = previous; ** code = previous;
goto END_REPEAT; ** goto END_REPEAT;
} ** }
/* If the maximum is 1 or unlimited, we just have to stick in the However, that fails when a group is referenced as a subroutine from
BRAZERO and do no more at this point. However, we do need to adjust elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
any OP_RECURSE calls inside the group that refer to the group itself or so that it is skipped on execution. As we don't have a list of which
any internal or forward referenced group, because the offset is from groups are referenced, we cannot do this selectively.
the start of the whole regex. Temporarily terminate the pattern while
doing this. */
if (repeat_max <= 1) If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
and do no more at this point. However, we do need to adjust any
OP_RECURSE calls inside the group that refer to the group itself or any
internal or forward referenced group, because the offset is from the
start of the whole regex. Temporarily terminate the pattern while doing
this. */
if (repeat_max <= 1) /* Covers 0, 1, and unlimited */
{ {
*code = OP_END; *code = OP_END;
adjust_recurse(previous, 1, utf8, cd, save_hwm); adjust_recurse(previous, 1, utf8, cd, save_hwm);
memmove(previous+1, previous, len); memmove(previous+1, previous, len);
code++; code++;
if (repeat_max == 0)
{
*previous++ = OP_SKIPZERO;
goto END_REPEAT;
}
*previous++ = OP_BRAZERO + repeat_type; *previous++ = OP_BRAZERO + repeat_type;
} }
@ -3906,6 +3996,13 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
} }
} }
/* If previous is OP_FAIL, it was generated by an empty class [] in
JavaScript mode. The other ways in which OP_FAIL can be generated, that is
by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
error above. We can just ignore the repeat in JS case. */
else if (*previous == OP_FAIL) goto END_REPEAT;
/* Else there's some kind of shambles */ /* Else there's some kind of shambles */
else else
@ -4192,7 +4289,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
/* Search the pattern for a forward reference */ /* Search the pattern for a forward reference */
else if ((i = find_parens(ptr, cd->bracount, name, namelen, else if ((i = find_parens(ptr, cd, name, namelen,
(options & PCRE_EXTENDED) != 0)) > 0) (options & PCRE_EXTENDED) != 0)) > 0)
{ {
PUT2(code, 2+LINK_SIZE, i); PUT2(code, 2+LINK_SIZE, i);
@ -4438,7 +4535,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
references (?P=name) and recursion (?P>name), as well as falling references (?P=name) and recursion (?P>name), as well as falling
through from the Perl recursion syntax (?&name). We also come here from through from the Perl recursion syntax (?&name). We also come here from
the Perl \k<name> or \k'name' back reference syntax and the \k{name} the Perl \k<name> or \k'name' back reference syntax and the \k{name}
.NET syntax. */ .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
NAMED_REF_OR_RECURSE: NAMED_REF_OR_RECURSE:
name = ++ptr; name = ++ptr;
@ -4489,7 +4586,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
recno = GET2(slot, 0); recno = GET2(slot, 0);
} }
else if ((recno = /* Forward back reference */ else if ((recno = /* Forward back reference */
find_parens(ptr, cd->bracount, name, namelen, find_parens(ptr, cd, name, namelen,
(options & PCRE_EXTENDED) != 0)) <= 0) (options & PCRE_EXTENDED) != 0)) <= 0)
{ {
*errorcodeptr = ERR15; *errorcodeptr = ERR15;
@ -4516,6 +4613,15 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
case '5': case '6': case '7': case '8': case '9': /* subroutine */ case '5': case '6': case '7': case '8': case '9': /* subroutine */
{ {
const uschar *called; const uschar *called;
terminator = ')';
/* Come here from the \g<...> and \g'...' code (Oniguruma
compatibility). However, the syntax has been checked to ensure that
the ... are a (signed) number, so that neither ERR63 nor ERR29 will
be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
ever be taken. */
HANDLE_NUMERICAL_RECURSION:
if ((refsign = *ptr) == '+') if ((refsign = *ptr) == '+')
{ {
@ -4537,7 +4643,7 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
while(g_ascii_isdigit(*ptr) != 0) while(g_ascii_isdigit(*ptr) != 0)
recno = recno * 10 + *ptr++ - '0'; recno = recno * 10 + *ptr++ - '0';
if (*ptr != ')') if (*ptr != terminator)
{ {
*errorcodeptr = ERR29; *errorcodeptr = ERR29;
goto FAILED; goto FAILED;
@ -4590,8 +4696,8 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
if (called == NULL) if (called == NULL)
{ {
if (find_parens(ptr, cd->bracount, NULL, recno, if (find_parens(ptr, cd, NULL, recno,
(options & PCRE_EXTENDED) != 0) < 0) (options & PCRE_EXTENDED) != 0) < 0)
{ {
*errorcodeptr = ERR15; *errorcodeptr = ERR15;
goto FAILED; goto FAILED;
@ -4961,6 +5067,64 @@ we set the flag only if there is a literal "\r" or "\n" in the class. */
zerofirstbyte = firstbyte; zerofirstbyte = firstbyte;
zeroreqbyte = reqbyte; zeroreqbyte = reqbyte;
/* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
is a subroutine call by number (Oniguruma syntax). In fact, the value
-ESC_g is returned only for these cases. So we don't need to check for <
or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
-ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
that is a synonym for a named back reference). */
if (-c == ESC_g)
{
const uschar *p;
save_hwm = cd->hwm; /* Normally this is set when '(' is read */
terminator = (*(++ptr) == '<')? '>' : '\'';
/* These two statements stop the compiler from warning about possibly
unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
fact, because we actually check for a number below, the paths that
would actually be in error are never taken. */
skipbytes = 0;
reset_bracount = FALSE;
/* Test for a name */
if (ptr[1] != '+' && ptr[1] != '-')
{
BOOL isnumber = TRUE;
for (p = ptr + 1; *p != 0 && *p != terminator; p++)
{
if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
if ((cd->ctypes[*p] & ctype_word) == 0) break;
}
if (*p != terminator)
{
*errorcodeptr = ERR57;
break;
}
if (isnumber)
{
ptr++;
goto HANDLE_NUMERICAL_RECURSION;
}
is_recurse = TRUE;
goto NAMED_REF_OR_RECURSE;
}
/* Test a signed number in angle brackets or quotes. */
p = ptr + 2;
while (g_ascii_isdigit(*p) != 0) p++;
if (*p != terminator)
{
*errorcodeptr = ERR57;
break;
}
ptr++;
goto HANDLE_NUMERICAL_RECURSION;
}
/* \k<name> or \k'name' is a back reference by name (Perl syntax). /* \k<name> or \k'name' is a back reference by name (Perl syntax).
We also support \k{name} (.NET syntax) */ We also support \k{name} (.NET syntax) */
@ -5467,14 +5631,14 @@ do {
if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE; if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
} }
/* .* is not anchored unless DOTALL is set and it isn't in brackets that /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
are or may be referenced. */ it isn't in brackets that are or may be referenced. */
else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR || else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
op == OP_TYPEPOSSTAR) && op == OP_TYPEPOSSTAR))
(*options & PCRE_DOTALL) != 0)
{ {
if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE; if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
return FALSE;
} }
/* Check for explicit anchoring */ /* Check for explicit anchoring */

View File

@ -84,11 +84,11 @@ centralize the loading of these characters. In the case of Type * etc, the
small value. ***NOTE*** If the start of this table is modified, the two tables small value. ***NOTE*** If the start of this table is modified, the two tables
that follow must also be modified. */ that follow must also be modified. */
static uschar coptable[] = { static const uschar coptable[] = {
0, /* End */ 0, /* End */
0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */ 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */ 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
0, 0, /* Any, Anybyte */ 0, 0, 0, /* Any, AllAny, Anybyte */
0, 0, 0, /* NOTPROP, PROP, EXTUNI */ 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */ 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */ 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
@ -132,26 +132,26 @@ static uschar coptable[] = {
0, /* DEF */ 0, /* DEF */
0, 0, /* BRAZERO, BRAMINZERO */ 0, 0, /* BRAZERO, BRAMINZERO */
0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */ 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
0, 0 /* FAIL, ACCEPT */ 0, 0, 0 /* FAIL, ACCEPT, SKIPZERO */
}; };
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W, /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
and \w */ and \w */
static uschar toptable1[] = { static const uschar toptable1[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
ctype_digit, ctype_digit, ctype_digit, ctype_digit,
ctype_space, ctype_space, ctype_space, ctype_space,
ctype_word, ctype_word, ctype_word, ctype_word,
0 /* OP_ANY */ 0, 0 /* OP_ANY, OP_ALLANY */
}; };
static uschar toptable2[] = { static const uschar toptable2[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
ctype_digit, 0, ctype_digit, 0,
ctype_space, 0, ctype_space, 0,
ctype_word, 0, ctype_word, 0,
1 /* OP_ANY */ 1, 1 /* OP_ANY, OP_ALLANY */
}; };
@ -223,8 +223,8 @@ Arguments:
rlevel function call recursion level rlevel function call recursion level
recursing regex recursive call level recursing regex recursive call level
Returns: > 0 => Returns: > 0 => number of match offset pairs placed in offsets
= 0 => = 0 => offsets overflowed; longest matches are present
-1 => failed to match -1 => failed to match
< -1 => some kind of unexpected problem < -1 => some kind of unexpected problem
@ -693,6 +693,13 @@ for (;;)
ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0); ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
break; break;
/*-----------------------------------------------------------------*/
case OP_SKIPZERO:
code += 1 + GET(code, 2);
while (*code == OP_ALT) code += GET(code, 1);
ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
break;
/*-----------------------------------------------------------------*/ /*-----------------------------------------------------------------*/
case OP_CIRC: case OP_CIRC:
if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) || if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
@ -732,7 +739,13 @@ for (;;)
/*-----------------------------------------------------------------*/ /*-----------------------------------------------------------------*/
case OP_ANY: case OP_ANY:
if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr))) if (clen > 0 && !IS_NEWLINE(ptr))
{ ADD_NEW(state_offset + 1, 0); }
break;
/*-----------------------------------------------------------------*/
case OP_ALLANY:
if (clen > 0)
{ ADD_NEW(state_offset + 1, 0); } { ADD_NEW(state_offset + 1, 0); }
break; break;
@ -852,8 +865,8 @@ for (;;)
/* ========================================================================== */ /* ========================================================================== */
/* These opcodes likewise inspect the subject character, but have an /* These opcodes likewise inspect the subject character, but have an
argument that is not a data character. It is one of these opcodes: argument that is not a data character. It is one of these opcodes:
OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR, OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
OP_NOT_WORDCHAR. The value is loaded into d. */ OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
case OP_TYPEPLUS: case OP_TYPEPLUS:
case OP_TYPEMINPLUS: case OP_TYPEMINPLUS:
@ -864,10 +877,7 @@ for (;;)
{ {
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
(c < 256 && (c < 256 &&
(d != OP_ANY || (d != OP_ANY || !IS_NEWLINE(ptr)) &&
(ims & PCRE_DOTALL) != 0 ||
!IS_NEWLINE(ptr)
) &&
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
{ {
if (count > 0 && codevalue == OP_TYPEPOSPLUS) if (count > 0 && codevalue == OP_TYPEPOSPLUS)
@ -890,10 +900,7 @@ for (;;)
{ {
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
(c < 256 && (c < 256 &&
(d != OP_ANY || (d != OP_ANY || !IS_NEWLINE(ptr)) &&
(ims & PCRE_DOTALL) != 0 ||
!IS_NEWLINE(ptr)
) &&
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
{ {
if (codevalue == OP_TYPEPOSQUERY) if (codevalue == OP_TYPEPOSQUERY)
@ -915,10 +922,7 @@ for (;;)
{ {
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
(c < 256 && (c < 256 &&
(d != OP_ANY || (d != OP_ANY || !IS_NEWLINE(ptr)) &&
(ims & PCRE_DOTALL) != 0 ||
!IS_NEWLINE(ptr)
) &&
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
{ {
if (codevalue == OP_TYPEPOSSTAR) if (codevalue == OP_TYPEPOSSTAR)
@ -938,10 +942,7 @@ for (;;)
{ {
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
(c < 256 && (c < 256 &&
(d != OP_ANY || (d != OP_ANY || !IS_NEWLINE(ptr)) &&
(ims & PCRE_DOTALL) != 0 ||
!IS_NEWLINE(ptr)
) &&
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
{ {
if (++count >= GET2(code, 1)) if (++count >= GET2(code, 1))
@ -962,10 +963,7 @@ for (;;)
{ {
if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) || if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
(c < 256 && (c < 256 &&
(d != OP_ANY || (d != OP_ANY || !IS_NEWLINE(ptr)) &&
(ims & PCRE_DOTALL) != 0 ||
!IS_NEWLINE(ptr)
) &&
((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0)) ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
{ {
if (codevalue == OP_TYPEPOSUPTO) if (codevalue == OP_TYPEPOSUPTO)
@ -2162,7 +2160,12 @@ for (;;)
/* ========================================================================== */ /* ========================================================================== */
/* These are the opcodes for fancy brackets of various kinds. We have /* These are the opcodes for fancy brackets of various kinds. We have
to use recursion in order to handle them. */ to use recursion in order to handle them. The "always failing" assertion
(?!) is optimised when compiling to OP_FAIL, so we have to support that,
though the other "backtracking verbs" are not supported. */
case OP_FAIL:
break;
case OP_ASSERT: case OP_ASSERT:
case OP_ASSERT_NOT: case OP_ASSERT_NOT:

View File

@ -1148,11 +1148,11 @@ for (;;)
do ecode += GET(ecode,1); while (*ecode == OP_ALT); do ecode += GET(ecode,1); while (*ecode == OP_ALT);
break; break;
/* BRAZERO and BRAMINZERO occur just before a bracket group, indicating /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
that it may occur zero times. It may repeat infinitely, or not at all - indicating that it may occur zero times. It may repeat infinitely, or not
i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
repeat limits are compiled as a number of copies, with the optional ones with fixed upper repeat limits are compiled as a number of copies, with the
preceded by BRAZERO or BRAMINZERO. */ optional ones preceded by BRAZERO or BRAMINZERO. */
case OP_BRAZERO: case OP_BRAZERO:
{ {
@ -1174,6 +1174,14 @@ for (;;)
} }
break; break;
case OP_SKIPZERO:
{
next = ecode+1;
do next += GET(next,1); while (*next == OP_ALT);
ecode = next + 1 + LINK_SIZE;
}
break;
/* End of a group, repeated or non-repeating. */ /* End of a group, repeated or non-repeating. */
case OP_KET: case OP_KET:
@ -1421,13 +1429,12 @@ for (;;)
/* Match a single character type; inline for speed */ /* Match a single character type; inline for speed */
case OP_ANY: case OP_ANY:
if ((ims & PCRE_DOTALL) == 0) if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
{ /* Fall through */
if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
} case OP_ALLANY:
if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH); if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
if (utf8) if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
ecode++; ecode++;
break; break;
@ -1723,16 +1730,25 @@ for (;;)
case OP_REF: case OP_REF:
{ {
offset = GET2(ecode, 1) << 1; /* Doubled ref number */ offset = GET2(ecode, 1) << 1; /* Doubled ref number */
ecode += 3; /* Advance past item */ ecode += 3;
/* If the reference is unset, set the length to be longer than the amount /* If the reference is unset, there are two possibilities:
of subject left; this ensures that every attempt at a match fails. We
can't just fail here, because of the possibility of quantifiers with zero
minima. */
length = (offset >= offset_top || md->offset_vector[offset] < 0)? (a) In the default, Perl-compatible state, set the length to be longer
md->end_subject - eptr + 1 : than the amount of subject left; this ensures that every attempt at a
md->offset_vector[offset+1] - md->offset_vector[offset]; match fails. We can't just fail here, because of the possibility of
quantifiers with zero minima.
(b) If the JavaScript compatibility flag is set, set the length to zero
so that the back reference matches an empty string.
Otherwise, set the length to the length of what was matched by the
referenced subpattern. */
if (offset >= offset_top || md->offset_vector[offset] < 0)
length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
else
length = md->offset_vector[offset+1] - md->offset_vector[offset];
/* Set up for repetition, or handle the non-repeated case */ /* Set up for repetition, or handle the non-repeated case */
@ -2935,14 +2951,22 @@ for (;;)
case OP_ANY: case OP_ANY:
for (i = 1; i <= min; i++) for (i = 1; i <= min; i++)
{ {
if (eptr >= md->end_subject || if (eptr >= md->end_subject || IS_NEWLINE(eptr))
((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
RRETURN(MATCH_NOMATCH); RRETURN(MATCH_NOMATCH);
eptr++; eptr++;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++; while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
} }
break; break;
case OP_ALLANY:
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
eptr++;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
break;
case OP_ANYBYTE: case OP_ANYBYTE:
eptr += min; eptr += min;
break; break;
@ -3151,15 +3175,15 @@ for (;;)
switch(ctype) switch(ctype)
{ {
case OP_ANY: case OP_ANY:
if ((ims & PCRE_DOTALL) == 0) for (i = 1; i <= min; i++)
{ {
for (i = 1; i <= min; i++) if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
{ eptr++;
if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
eptr++;
}
} }
else eptr += min; break;
case OP_ALLANY:
eptr += min;
break; break;
case OP_ANYBYTE: case OP_ANYBYTE:
@ -3416,16 +3440,14 @@ for (;;)
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42); RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject || if (fi >= max || eptr >= md->end_subject ||
(ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 && (ctype == OP_ANY && IS_NEWLINE(eptr)))
IS_NEWLINE(eptr)))
RRETURN(MATCH_NOMATCH); RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr); GETCHARINC(c, eptr);
switch(ctype) switch(ctype)
{ {
case OP_ANY: /* This is the DOTALL case */ case OP_ANY: /* This is the non-NL case */
break; case OP_ALLANY:
case OP_ANYBYTE: case OP_ANYBYTE:
break; break;
@ -3577,15 +3599,14 @@ for (;;)
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43); RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
if (rrc != MATCH_NOMATCH) RRETURN(rrc); if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject || if (fi >= max || eptr >= md->end_subject ||
((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr))) (ctype == OP_ANY && IS_NEWLINE(eptr)))
RRETURN(MATCH_NOMATCH); RRETURN(MATCH_NOMATCH);
c = *eptr++; c = *eptr++;
switch(ctype) switch(ctype)
{ {
case OP_ANY: /* This is the DOTALL case */ case OP_ANY: /* This is the non-NL case */
break; case OP_ALLANY:
case OP_ANYBYTE: case OP_ANYBYTE:
break; break;
@ -3839,23 +3860,11 @@ for (;;)
case OP_ANY: case OP_ANY:
if (max < INT_MAX) if (max < INT_MAX)
{ {
if ((ims & PCRE_DOTALL) == 0) for (i = min; i < max; i++)
{ {
for (i = min; i < max; i++) if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
{ eptr++;
if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
eptr++;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
}
else
{
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject) break;
eptr++;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
} }
} }
@ -3863,22 +3872,28 @@ for (;;)
else else
{ {
if ((ims & PCRE_DOTALL) == 0) for (i = min; i < max; i++)
{ {
for (i = min; i < max; i++) if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
{ eptr++;
if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break; while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
eptr++;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
}
else
{
eptr = md->end_subject;
} }
} }
break; break;
case OP_ALLANY:
if (max < INT_MAX)
{
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject) break;
eptr++;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
}
else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
break;
/* The byte case is the same as non-UTF8 */ /* The byte case is the same as non-UTF8 */
case OP_ANYBYTE: case OP_ANYBYTE:
@ -4064,17 +4079,14 @@ for (;;)
switch(ctype) switch(ctype)
{ {
case OP_ANY: case OP_ANY:
if ((ims & PCRE_DOTALL) == 0) for (i = min; i < max; i++)
{ {
for (i = min; i < max; i++) if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
{ eptr++;
if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
eptr++;
}
break;
} }
/* For DOTALL case, fall through and treat as \C */ break;
case OP_ALLANY:
case OP_ANYBYTE: case OP_ANYBYTE:
c = max - min; c = max - min;
if (c > (unsigned int)(md->end_subject - eptr)) if (c > (unsigned int)(md->end_subject - eptr))
@ -4450,6 +4462,7 @@ end_subject = md->end_subject;
md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0; md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0; utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
md->notbol = (options & PCRE_NOTBOL) != 0; md->notbol = (options & PCRE_NOTBOL) != 0;
md->noteol = (options & PCRE_NOTEOL) != 0; md->noteol = (options & PCRE_NOTEOL) != 0;

View File

@ -52,6 +52,8 @@ differently, and global variables are not used (see pcre.in). */
#include "pcre_internal.h" #include "pcre_internal.h"
#ifndef VPCOMPAT
PCRE_EXP_DATA_DEFN int (*pcre_callout)(pcre_callout_block *) = NULL; PCRE_EXP_DATA_DEFN int (*pcre_callout)(pcre_callout_block *) = NULL;
#endif
/* End of pcre_globals.c */ /* End of pcre_globals.c */

View File

@ -514,7 +514,8 @@ time, run time, or study time, respectively. */
(PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \ (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \ PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \ PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \
PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE) PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \
PCRE_JAVASCRIPT_COMPAT)
#define PUBLIC_EXEC_OPTIONS \ #define PUBLIC_EXEC_OPTIONS \
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
@ -601,16 +602,20 @@ contain UTF-8 characters with values greater than 255. */
value such as \n. They must have non-zero values, as check_escape() returns value such as \n. They must have non-zero values, as check_escape() returns
their negation. Also, they must appear in the same order as in the opcode their negation. Also, they must appear in the same order as in the opcode
definitions below, up to ESC_z. There's a dummy for OP_ANY because it definitions below, up to ESC_z. There's a dummy for OP_ANY because it
corresponds to "." rather than an escape sequence. The final one must be corresponds to "." rather than an escape sequence, and another for OP_ALLANY
ESC_REF as subsequent values are used for backreferences (\1, \2, \3, etc). (which is used for [^] in JavaScript compatibility mode).
There are two tests in the code for an escape greater than ESC_b and less than
ESC_Z to detect the types that may be repeated. These are the types that The final escape must be ESC_REF as subsequent values are used for
consume characters. If any new escapes are put in between that don't consume a backreferences (\1, \2, \3, etc). There are two tests in the code for an escape
character, that code will have to change. */ greater than ESC_b and less than ESC_Z to detect the types that may be
repeated. These are the types that consume characters. If any new escapes are
put in between that don't consume a character, that code will have to change.
*/
enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s,
ESC_W, ESC_w, ESC_dum1, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H, ESC_h, ESC_W, ESC_w, ESC_dum1, ESC_dum2, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H,
ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_k, ESC_REF }; ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_g, ESC_k,
ESC_REF };
/* Opcode table: Starting from 1 (i.e. after OP_END), the values up to /* Opcode table: Starting from 1 (i.e. after OP_END), the values up to
@ -636,141 +641,146 @@ enum {
OP_WHITESPACE, /* 9 \s */ OP_WHITESPACE, /* 9 \s */
OP_NOT_WORDCHAR, /* 10 \W */ OP_NOT_WORDCHAR, /* 10 \W */
OP_WORDCHAR, /* 11 \w */ OP_WORDCHAR, /* 11 \w */
OP_ANY, /* 12 Match any character */ OP_ANY, /* 12 Match any character (subject to DOTALL) */
OP_ANYBYTE, /* 13 Match any byte (\C); different to OP_ANY for UTF-8 */ OP_ALLANY, /* 13 Match any character (not subject to DOTALL) */
OP_NOTPROP, /* 14 \P (not Unicode property) */ OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */
OP_PROP, /* 15 \p (Unicode property) */ OP_NOTPROP, /* 15 \P (not Unicode property) */
OP_ANYNL, /* 16 \R (any newline sequence) */ OP_PROP, /* 16 \p (Unicode property) */
OP_NOT_HSPACE, /* 17 \H (not horizontal whitespace) */ OP_ANYNL, /* 17 \R (any newline sequence) */
OP_HSPACE, /* 18 \h (horizontal whitespace) */ OP_NOT_HSPACE, /* 18 \H (not horizontal whitespace) */
OP_NOT_VSPACE, /* 19 \V (not vertical whitespace) */ OP_HSPACE, /* 19 \h (horizontal whitespace) */
OP_VSPACE, /* 20 \v (vertical whitespace) */ OP_NOT_VSPACE, /* 20 \V (not vertical whitespace) */
OP_EXTUNI, /* 21 \X (extended Unicode sequence) */ OP_VSPACE, /* 21 \v (vertical whitespace) */
OP_EODN, /* 22 End of data or \n at end of data: \Z. */ OP_EXTUNI, /* 22 \X (extended Unicode sequence) */
OP_EOD, /* 23 End of data: \z */ OP_EODN, /* 23 End of data or \n at end of data: \Z. */
OP_EOD, /* 24 End of data: \z */
OP_OPT, /* 24 Set runtime options */ OP_OPT, /* 25 Set runtime options */
OP_CIRC, /* 25 Start of line - varies with multiline switch */ OP_CIRC, /* 26 Start of line - varies with multiline switch */
OP_DOLL, /* 26 End of line - varies with multiline switch */ OP_DOLL, /* 27 End of line - varies with multiline switch */
OP_CHAR, /* 27 Match one character, casefully */ OP_CHAR, /* 28 Match one character, casefully */
OP_CHARNC, /* 28 Match one character, caselessly */ OP_CHARNC, /* 29 Match one character, caselessly */
OP_NOT, /* 29 Match one character, not the following one */ OP_NOT, /* 30 Match one character, not the following one */
OP_STAR, /* 30 The maximizing and minimizing versions of */ OP_STAR, /* 31 The maximizing and minimizing versions of */
OP_MINSTAR, /* 31 these six opcodes must come in pairs, with */ OP_MINSTAR, /* 32 these six opcodes must come in pairs, with */
OP_PLUS, /* 32 the minimizing one second. */ OP_PLUS, /* 33 the minimizing one second. */
OP_MINPLUS, /* 33 This first set applies to single characters.*/ OP_MINPLUS, /* 34 This first set applies to single characters.*/
OP_QUERY, /* 34 */ OP_QUERY, /* 35 */
OP_MINQUERY, /* 35 */ OP_MINQUERY, /* 36 */
OP_UPTO, /* 36 From 0 to n matches */ OP_UPTO, /* 37 From 0 to n matches */
OP_MINUPTO, /* 37 */ OP_MINUPTO, /* 38 */
OP_EXACT, /* 38 Exactly n matches */ OP_EXACT, /* 39 Exactly n matches */
OP_POSSTAR, /* 39 Possessified star */ OP_POSSTAR, /* 40 Possessified star */
OP_POSPLUS, /* 40 Possessified plus */ OP_POSPLUS, /* 41 Possessified plus */
OP_POSQUERY, /* 41 Possessified query */ OP_POSQUERY, /* 42 Possessified query */
OP_POSUPTO, /* 42 Possessified upto */ OP_POSUPTO, /* 43 Possessified upto */
OP_NOTSTAR, /* 43 The maximizing and minimizing versions of */ OP_NOTSTAR, /* 44 The maximizing and minimizing versions of */
OP_NOTMINSTAR, /* 44 these six opcodes must come in pairs, with */ OP_NOTMINSTAR, /* 45 these six opcodes must come in pairs, with */
OP_NOTPLUS, /* 45 the minimizing one second. They must be in */ OP_NOTPLUS, /* 46 the minimizing one second. They must be in */
OP_NOTMINPLUS, /* 46 exactly the same order as those above. */ OP_NOTMINPLUS, /* 47 exactly the same order as those above. */
OP_NOTQUERY, /* 47 This set applies to "not" single characters. */ OP_NOTQUERY, /* 48 This set applies to "not" single characters. */
OP_NOTMINQUERY, /* 48 */ OP_NOTMINQUERY, /* 49 */
OP_NOTUPTO, /* 49 From 0 to n matches */ OP_NOTUPTO, /* 50 From 0 to n matches */
OP_NOTMINUPTO, /* 50 */ OP_NOTMINUPTO, /* 51 */
OP_NOTEXACT, /* 51 Exactly n matches */ OP_NOTEXACT, /* 52 Exactly n matches */
OP_NOTPOSSTAR, /* 52 Possessified versions */ OP_NOTPOSSTAR, /* 53 Possessified versions */
OP_NOTPOSPLUS, /* 53 */ OP_NOTPOSPLUS, /* 54 */
OP_NOTPOSQUERY, /* 54 */ OP_NOTPOSQUERY, /* 55 */
OP_NOTPOSUPTO, /* 55 */ OP_NOTPOSUPTO, /* 56 */
OP_TYPESTAR, /* 56 The maximizing and minimizing versions of */ OP_TYPESTAR, /* 57 The maximizing and minimizing versions of */
OP_TYPEMINSTAR, /* 57 these six opcodes must come in pairs, with */ OP_TYPEMINSTAR, /* 58 these six opcodes must come in pairs, with */
OP_TYPEPLUS, /* 58 the minimizing one second. These codes must */ OP_TYPEPLUS, /* 59 the minimizing one second. These codes must */
OP_TYPEMINPLUS, /* 59 be in exactly the same order as those above. */ OP_TYPEMINPLUS, /* 60 be in exactly the same order as those above. */
OP_TYPEQUERY, /* 60 This set applies to character types such as \d */ OP_TYPEQUERY, /* 61 This set applies to character types such as \d */
OP_TYPEMINQUERY, /* 61 */ OP_TYPEMINQUERY, /* 62 */
OP_TYPEUPTO, /* 62 From 0 to n matches */ OP_TYPEUPTO, /* 63 From 0 to n matches */
OP_TYPEMINUPTO, /* 63 */ OP_TYPEMINUPTO, /* 64 */
OP_TYPEEXACT, /* 64 Exactly n matches */ OP_TYPEEXACT, /* 65 Exactly n matches */
OP_TYPEPOSSTAR, /* 65 Possessified versions */ OP_TYPEPOSSTAR, /* 66 Possessified versions */
OP_TYPEPOSPLUS, /* 66 */ OP_TYPEPOSPLUS, /* 67 */
OP_TYPEPOSQUERY, /* 67 */ OP_TYPEPOSQUERY, /* 68 */
OP_TYPEPOSUPTO, /* 68 */ OP_TYPEPOSUPTO, /* 69 */
OP_CRSTAR, /* 69 The maximizing and minimizing versions of */ OP_CRSTAR, /* 70 The maximizing and minimizing versions of */
OP_CRMINSTAR, /* 70 all these opcodes must come in pairs, with */ OP_CRMINSTAR, /* 71 all these opcodes must come in pairs, with */
OP_CRPLUS, /* 71 the minimizing one second. These codes must */ OP_CRPLUS, /* 72 the minimizing one second. These codes must */
OP_CRMINPLUS, /* 72 be in exactly the same order as those above. */ OP_CRMINPLUS, /* 73 be in exactly the same order as those above. */
OP_CRQUERY, /* 73 These are for character classes and back refs */ OP_CRQUERY, /* 74 These are for character classes and back refs */
OP_CRMINQUERY, /* 74 */ OP_CRMINQUERY, /* 75 */
OP_CRRANGE, /* 75 These are different to the three sets above. */ OP_CRRANGE, /* 76 These are different to the three sets above. */
OP_CRMINRANGE, /* 76 */ OP_CRMINRANGE, /* 77 */
OP_CLASS, /* 77 Match a character class, chars < 256 only */ OP_CLASS, /* 78 Match a character class, chars < 256 only */
OP_NCLASS, /* 78 Same, but the bitmap was created from a negative OP_NCLASS, /* 79 Same, but the bitmap was created from a negative
class - the difference is relevant only when a UTF-8 class - the difference is relevant only when a UTF-8
character > 255 is encountered. */ character > 255 is encountered. */
OP_XCLASS, /* 79 Extended class for handling UTF-8 chars within the OP_XCLASS, /* 80 Extended class for handling UTF-8 chars within the
class. This does both positive and negative. */ class. This does both positive and negative. */
OP_REF, /* 80 Match a back reference */ OP_REF, /* 81 Match a back reference */
OP_RECURSE, /* 81 Match a numbered subpattern (possibly recursive) */ OP_RECURSE, /* 82 Match a numbered subpattern (possibly recursive) */
OP_CALLOUT, /* 82 Call out to external function if provided */ OP_CALLOUT, /* 83 Call out to external function if provided */
OP_ALT, /* 83 Start of alternation */ OP_ALT, /* 84 Start of alternation */
OP_KET, /* 84 End of group that doesn't have an unbounded repeat */ OP_KET, /* 85 End of group that doesn't have an unbounded repeat */
OP_KETRMAX, /* 85 These two must remain together and in this */ OP_KETRMAX, /* 86 These two must remain together and in this */
OP_KETRMIN, /* 86 order. They are for groups that repeat for ever. */ OP_KETRMIN, /* 87 order. They are for groups that repeat for ever. */
/* The assertions must come before BRA, CBRA, ONCE, and COND.*/ /* The assertions must come before BRA, CBRA, ONCE, and COND.*/
OP_ASSERT, /* 87 Positive lookahead */ OP_ASSERT, /* 88 Positive lookahead */
OP_ASSERT_NOT, /* 88 Negative lookahead */ OP_ASSERT_NOT, /* 89 Negative lookahead */
OP_ASSERTBACK, /* 89 Positive lookbehind */ OP_ASSERTBACK, /* 90 Positive lookbehind */
OP_ASSERTBACK_NOT, /* 90 Negative lookbehind */ OP_ASSERTBACK_NOT, /* 91 Negative lookbehind */
OP_REVERSE, /* 91 Move pointer back - used in lookbehind assertions */ OP_REVERSE, /* 92 Move pointer back - used in lookbehind assertions */
/* ONCE, BRA, CBRA, and COND must come after the assertions, with ONCE first, /* ONCE, BRA, CBRA, and COND must come after the assertions, with ONCE first,
as there's a test for >= ONCE for a subpattern that isn't an assertion. */ as there's a test for >= ONCE for a subpattern that isn't an assertion. */
OP_ONCE, /* 92 Atomic group */ OP_ONCE, /* 93 Atomic group */
OP_BRA, /* 93 Start of non-capturing bracket */ OP_BRA, /* 94 Start of non-capturing bracket */
OP_CBRA, /* 94 Start of capturing bracket */ OP_CBRA, /* 95 Start of capturing bracket */
OP_COND, /* 95 Conditional group */ OP_COND, /* 96 Conditional group */
/* These three must follow the previous three, in the same order. There's a /* These three must follow the previous three, in the same order. There's a
check for >= SBRA to distinguish the two sets. */ check for >= SBRA to distinguish the two sets. */
OP_SBRA, /* 96 Start of non-capturing bracket, check empty */ OP_SBRA, /* 97 Start of non-capturing bracket, check empty */
OP_SCBRA, /* 97 Start of capturing bracket, check empty */ OP_SCBRA, /* 98 Start of capturing bracket, check empty */
OP_SCOND, /* 98 Conditional group, check empty */ OP_SCOND, /* 99 Conditional group, check empty */
OP_CREF, /* 99 Used to hold a capture number as condition */ OP_CREF, /* 100 Used to hold a capture number as condition */
OP_RREF, /* 100 Used to hold a recursion number as condition */ OP_RREF, /* 101 Used to hold a recursion number as condition */
OP_DEF, /* 101 The DEFINE condition */ OP_DEF, /* 102 The DEFINE condition */
OP_BRAZERO, /* 102 These two must remain together and in this */ OP_BRAZERO, /* 103 These two must remain together and in this */
OP_BRAMINZERO, /* 103 order. */ OP_BRAMINZERO, /* 104 order. */
/* These are backtracking control verbs */ /* These are backtracking control verbs */
OP_PRUNE, /* 104 */ OP_PRUNE, /* 105 */
OP_SKIP, /* 105 */ OP_SKIP, /* 106 */
OP_THEN, /* 106 */ OP_THEN, /* 107 */
OP_COMMIT, /* 107 */ OP_COMMIT, /* 108 */
/* These are forced failure and success verbs */ /* These are forced failure and success verbs */
OP_FAIL, /* 108 */ OP_FAIL, /* 109 */
OP_ACCEPT /* 109 */ OP_ACCEPT, /* 110 */
/* This is used to skip a subpattern with a {0} quantifier */
OP_SKIPZERO /* 111 */
}; };
@ -779,7 +789,7 @@ for debugging. The macro is referenced only in pcre_printint.c. */
#define OP_NAME_LIST \ #define OP_NAME_LIST \
"End", "\\A", "\\G", "\\K", "\\B", "\\b", "\\D", "\\d", \ "End", "\\A", "\\G", "\\K", "\\B", "\\b", "\\D", "\\d", \
"\\S", "\\s", "\\W", "\\w", "Any", "Anybyte", \ "\\S", "\\s", "\\W", "\\w", "Any", "AllAny", "Anybyte", \
"notprop", "prop", "\\R", "\\H", "\\h", "\\V", "\\v", \ "notprop", "prop", "\\R", "\\H", "\\h", "\\V", "\\v", \
"extuni", "\\Z", "\\z", \ "extuni", "\\Z", "\\z", \
"Opt", "^", "$", "char", "charnc", "not", \ "Opt", "^", "$", "char", "charnc", "not", \
@ -795,7 +805,8 @@ for debugging. The macro is referenced only in pcre_printint.c. */
"AssertB", "AssertB not", "Reverse", \ "AssertB", "AssertB not", "Reverse", \
"Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", \ "Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", \
"Cond ref", "Cond rec", "Cond def", "Brazero", "Braminzero", \ "Cond ref", "Cond rec", "Cond def", "Brazero", "Braminzero", \
"*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT" "*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \
"Skip zero"
/* This macro defines the length of fixed length operations in the compiled /* This macro defines the length of fixed length operations in the compiled
@ -811,7 +822,7 @@ in UTF-8 mode. The code that uses this table must know about such things. */
1, /* End */ \ 1, /* End */ \
1, 1, 1, 1, 1, /* \A, \G, \K, \B, \b */ \ 1, 1, 1, 1, 1, /* \A, \G, \K, \B, \b */ \
1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ \ 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ \
1, 1, /* Any, Anybyte */ \ 1, 1, 1, /* Any, AllAny, Anybyte */ \
3, 3, 1, /* NOTPROP, PROP, EXTUNI */ \ 3, 3, 1, /* NOTPROP, PROP, EXTUNI */ \
1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ \ 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ \
1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \ 1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \
@ -860,7 +871,7 @@ in UTF-8 mode. The code that uses this table must know about such things. */
1, /* DEF */ \ 1, /* DEF */ \
1, 1, /* BRAZERO, BRAMINZERO */ \ 1, 1, /* BRAZERO, BRAMINZERO */ \
1, 1, 1, 1, /* PRUNE, SKIP, THEN, COMMIT, */ \ 1, 1, 1, 1, /* PRUNE, SKIP, THEN, COMMIT, */ \
1, 1 /* FAIL, ACCEPT */ 1, 1, 1 /* FAIL, ACCEPT, SKIPZERO */
/* A magic value for OP_RREF to indicate the "any recursion" condition. */ /* A magic value for OP_RREF to indicate the "any recursion" condition. */
@ -876,7 +887,7 @@ enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
ERR60, ERR61, ERR62, ERR63 }; ERR60, ERR61, ERR62, ERR63, ERR64 };
/* The real format of the start of the pcre block; the index of names and the /* The real format of the start of the pcre block; the index of names and the
code vector run on as long as necessary after the end. We store an explicit code vector run on as long as necessary after the end. We store an explicit
@ -1001,6 +1012,7 @@ typedef struct match_data {
BOOL notbol; /* NOTBOL flag */ BOOL notbol; /* NOTBOL flag */
BOOL noteol; /* NOTEOL flag */ BOOL noteol; /* NOTEOL flag */
BOOL utf8; /* UTF8 flag */ BOOL utf8; /* UTF8 flag */
BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */
BOOL endonly; /* Dollar not before final \n */ BOOL endonly; /* Dollar not before final \n */
BOOL notempty; /* Empty string match not wanted */ BOOL notempty; /* Empty string match not wanted */
BOOL partial; /* PARTIAL flag */ BOOL partial; /* PARTIAL flag */

View File

@ -217,6 +217,13 @@ do
tcode += 1 + LINK_SIZE; tcode += 1 + LINK_SIZE;
break; break;
/* SKIPZERO skips the bracket. */
case OP_SKIPZERO:
do tcode += GET(tcode,1); while (*tcode == OP_ALT);
tcode += 1 + LINK_SIZE;
break;
/* Single-char * or ? sets the bit and tries the next item */ /* Single-char * or ? sets the bit and tries the next item */
case OP_STAR: case OP_STAR:
@ -341,6 +348,7 @@ do
switch(tcode[1]) switch(tcode[1])
{ {
case OP_ANY: case OP_ANY:
case OP_ALLANY:
return SSB_FAIL; return SSB_FAIL;
case OP_NOT_DIGIT: case OP_NOT_DIGIT:

View File

@ -17,7 +17,7 @@ typedef struct cnode {
#define f0_scriptmask 0xff000000 /* Mask for script field */ #define f0_scriptmask 0xff000000 /* Mask for script field */
#define f0_scriptshift 24 /* Shift for script value */ #define f0_scriptshift 24 /* Shift for script value */
#define f0_rangeflag 0x00f00000 /* Flag for a range item */ #define f0_rangeflag 0x00800000 /* Flag for a range item */
#define f0_charmask 0x001fffff /* Mask for code point value */ #define f0_charmask 0x001fffff /* Mask for code point value */
/* Things for the f1 field */ /* Things for the f1 field */