Update the included pcre to 8.02

This commit is contained in:
Matthias Clasen 2010-06-20 01:46:35 -04:00
parent b0b7aeffc0
commit 85621f1a0f
18 changed files with 4156 additions and 1416 deletions

View File

@ -53,7 +53,6 @@ libpcre_la_SOURCES = \
pcre.h \ pcre.h \
pcre_internal.h \ pcre_internal.h \
ucp.h \ ucp.h \
ucpinternal.h \
$(libpcre_headers) $(libpcre_headers)
libpcre_la_LIBADD = $(DEP_LIBS) libpcre_la_LIBADD = $(DEP_LIBS)

View File

@ -5,7 +5,7 @@
/* This is the public header file for the PCRE library, to be #included by /* This is the public header file for the PCRE library, to be #included by
applications that call the PCRE functions. applications that call the PCRE functions.
Copyright (c) 1997-2008 University of Cambridge Copyright (c) 1997-2009 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -41,10 +41,10 @@ POSSIBILITY OF SUCH DAMAGE.
/* The current PCRE version information. */ /* The current PCRE version information. */
#define PCRE_MAJOR 7 #define PCRE_MAJOR 8
#define PCRE_MINOR 8 #define PCRE_MINOR 02
#define PCRE_PRERELEASE #define PCRE_PRERELEASE
#define PCRE_DATE 2008-09-05 #define PCRE_DATE 2010-03-19
/* When an application links to a PCRE DLL in Windows, the symbols that are /* When an application links to a PCRE DLL in Windows, the symbols that are
imported have to be identified as such. When building PCRE, the appropriate imported have to be identified as such. When building PCRE, the appropriate
@ -95,7 +95,8 @@ it is needed here for malloc. */
extern "C" { extern "C" {
#endif #endif
/* Options */ /* Options. Some are compile-time only, some are run-time only, and some are
both, so we keep them all distinct. */
#define PCRE_CASELESS 0x00000001 #define PCRE_CASELESS 0x00000001
#define PCRE_MULTILINE 0x00000002 #define PCRE_MULTILINE 0x00000002
@ -112,7 +113,8 @@ extern "C" {
#define PCRE_NO_AUTO_CAPTURE 0x00001000 #define PCRE_NO_AUTO_CAPTURE 0x00001000
#define PCRE_NO_UTF8_CHECK 0x00002000 #define PCRE_NO_UTF8_CHECK 0x00002000
#define PCRE_AUTO_CALLOUT 0x00004000 #define PCRE_AUTO_CALLOUT 0x00004000
#define PCRE_PARTIAL 0x00008000 #define PCRE_PARTIAL_SOFT 0x00008000
#define PCRE_PARTIAL 0x00008000 /* Backwards compatible synonym */
#define PCRE_DFA_SHORTEST 0x00010000 #define PCRE_DFA_SHORTEST 0x00010000
#define PCRE_DFA_RESTART 0x00020000 #define PCRE_DFA_RESTART 0x00020000
#define PCRE_FIRSTLINE 0x00040000 #define PCRE_FIRSTLINE 0x00040000
@ -125,6 +127,10 @@ extern "C" {
#define PCRE_BSR_ANYCRLF 0x00800000 #define PCRE_BSR_ANYCRLF 0x00800000
#define PCRE_BSR_UNICODE 0x01000000 #define PCRE_BSR_UNICODE 0x01000000
#define PCRE_JAVASCRIPT_COMPAT 0x02000000 #define PCRE_JAVASCRIPT_COMPAT 0x02000000
#define PCRE_NO_START_OPTIMIZE 0x04000000
#define PCRE_NO_START_OPTIMISE 0x04000000
#define PCRE_PARTIAL_HARD 0x08000000
#define PCRE_NOTEMPTY_ATSTART 0x10000000
/* Exec-time and get/set-time error codes */ /* Exec-time and get/set-time error codes */
@ -171,6 +177,7 @@ extern "C" {
#define PCRE_INFO_OKPARTIAL 12 #define PCRE_INFO_OKPARTIAL 12
#define PCRE_INFO_JCHANGED 13 #define PCRE_INFO_JCHANGED 13
#define PCRE_INFO_HASCRORLF 14 #define PCRE_INFO_HASCRORLF 14
#define PCRE_INFO_MINLENGTH 15
/* Request types for pcre_config(). Do not re-arrange, in order to remain /* Request types for pcre_config(). Do not re-arrange, in order to remain
compatible. */ compatible. */
@ -250,7 +257,7 @@ typedef struct pcre_callout_block {
#define pcre_free g_free #define pcre_free g_free
#define pcre_stack_malloc g_try_malloc #define pcre_stack_malloc g_try_malloc
PCRE_EXP_DECL int (*pcre_callout)(pcre_callout_block *); int (*pcre_callout)(pcre_callout_block *);
/* Exported PCRE functions */ /* Exported PCRE functions */

File diff suppressed because it is too large Load Diff

View File

@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language. and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Copyright (c) 1997-2008 University of Cambridge Copyright (c) 1997-2009 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -104,11 +104,11 @@ switch (what)
break; break;
case PCRE_CONFIG_MATCH_LIMIT: case PCRE_CONFIG_MATCH_LIMIT:
*((unsigned int *)where) = MATCH_LIMIT; *((unsigned long int *)where) = MATCH_LIMIT;
break; break;
case PCRE_CONFIG_MATCH_LIMIT_RECURSION: case PCRE_CONFIG_MATCH_LIMIT_RECURSION:
*((unsigned int *)where) = MATCH_LIMIT_RECURSION; *((unsigned long int *)where) = MATCH_LIMIT_RECURSION;
break; break;
case PCRE_CONFIG_STACKRECURSE: case PCRE_CONFIG_STACKRECURSE:

View File

@ -3,10 +3,11 @@
*************************************************/ *************************************************/
/* PCRE is a library of functions to support regular expressions whose syntax /* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language. and semantics are as close as possible to those of the Perl 5 language (but see
below for why this module is different).
Written by Philip Hazel Written by Philip Hazel
Copyright (c) 1997-2008 University of Cambridge Copyright (c) 1997-2010 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -44,6 +45,34 @@ FSM). This is NOT Perl- compatible, but it has advantages in certain
applications. */ applications. */
/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
the performance of his patterns greatly. I could not use it as it stood, as it
was not thread safe, and made assumptions about pattern sizes. Also, it caused
test 7 to loop, and test 9 to crash with a segfault.
The issue is the check for duplicate states, which is done by a simple linear
search up the state list. (Grep for "duplicate" below to find the code.) For
many patterns, there will never be many states active at one time, so a simple
linear search is fine. In patterns that have many active states, it might be a
bottleneck. The suggested code used an indexing scheme to remember which states
had previously been used for each character, and avoided the linear search when
it knew there was no chance of a duplicate. This was implemented when adding
states to the state lists.
I wrote some thread-safe, not-limited code to try something similar at the time
of checking for duplicates (instead of when adding states), using index vectors
on the stack. It did give a 13% improvement with one specially constructed
pattern for certain subject strings, but on other strings and on many of the
simpler patterns in the test suite it did worse. The major problem, I think,
was the extra time to initialize the index. This had to be done for each call
of internal_dfa_exec(). (The supplied patch used a static vector, initialized
only once - I suspect this was the cause of the problems with the tests.)
Overall, I concluded that the gains in some cases did not outweigh the losses
in others, so I abandoned this code. */
#ifdef HAVE_CONFIG_H #ifdef HAVE_CONFIG_H
#include "config.h" #include "config.h"
#endif #endif
@ -60,7 +89,6 @@ applications. */
#define SP " " #define SP " "
/************************************************* /*************************************************
* Code parameters and static tables * * Code parameters and static tables *
*************************************************/ *************************************************/
@ -81,16 +109,18 @@ never stored, so we push them well clear of the normal opcodes. */
character that is to be tested in some way. This makes it possible to character that is to be tested in some way. This makes it possible to
centralize the loading of these characters. In the case of Type * etc, the centralize the loading of these characters. In the case of Type * etc, the
"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
small value. ***NOTE*** If the start of this table is modified, the two tables small value. Non-zero values in the table are the offsets from the opcode where
that follow must also be modified. */ the character is to be found. ***NOTE*** If the start of this table is
modified, the three tables that follow must also be modified. */
static const uschar coptable[] = { static const uschar coptable[] = {
0, /* End */ 0, /* End */
0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */ 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */ 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
0, 0, 0, /* Any, AllAny, Anybyte */ 0, 0, 0, /* Any, AllAny, Anybyte */
0, 0, 0, /* NOTPROP, PROP, EXTUNI */ 0, 0, /* \P, \p */
0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */ 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
0, /* \X */
0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */ 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
1, /* Char */ 1, /* Char */
1, /* Charnc */ 1, /* Charnc */
@ -127,12 +157,69 @@ static const uschar coptable[] = {
0, /* Reverse */ 0, /* Reverse */
0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */ 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
0, 0, 0, /* SBRA, SCBRA, SCOND */ 0, 0, 0, /* SBRA, SCBRA, SCOND */
0, /* CREF */ 0, 0, /* CREF, NCREF */
0, /* RREF */ 0, 0, /* RREF, NRREF */
0, /* DEF */ 0, /* DEF */
0, 0, /* BRAZERO, BRAMINZERO */ 0, 0, /* BRAZERO, BRAMINZERO */
0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */ 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
0, 0, 0 /* FAIL, ACCEPT, SKIPZERO */ 0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
};
/* This table identifies those opcodes that inspect a character. It is used to
remember the fact that a character could have been inspected when the end of
the subject is reached. ***NOTE*** If the start of this table is modified, the
two tables that follow must also be modified.

The table is indexed by opcode value: a non-zero entry means the opcode
inspects a subject character, zero means it does not. The matching loop
consults it only when no subject character is left (clen == 0), in order to set
could_continue, which is later one of the conditions for returning a partial
match. The table must contain exactly OP_TABLE_LENGTH entries; a pair of dummy
case labels in the opcode switch turns a wrong length into a compile-time
error. */
static const uschar poptable[] = {
0, /* End */
0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
1, 1, 1, /* Any, AllAny, Anybyte */
1, 1, /* \P, \p */
1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
1, /* \X */
0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
1, /* Char */
1, /* Charnc */
1, /* not */
/* Positive single-char repeats */
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
1, 1, 1, /* upto, minupto, exact */
1, 1, 1, 1, /* *+, ++, ?+, upto+ */
/* Negative single-char repeats - only for chars < 256 */
1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
1, 1, 1, /* NOT upto, minupto, exact */
1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
/* Positive type repeats */
1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
1, 1, 1, /* Type upto, minupto, exact */
1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
/* Character class & ref repeats */
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
1, 1, /* CRRANGE, CRMINRANGE */
1, /* CLASS */
1, /* NCLASS */
1, /* XCLASS - variable length */
/* All remaining opcodes are marked 0, i.e. as not inspecting a character */
0, /* REF */
0, /* RECURSE */
0, /* CALLOUT */
0, /* Alt */
0, /* Ket */
0, /* KetRmax */
0, /* KetRmin */
0, /* Assert */
0, /* Assert not */
0, /* Assert behind */
0, /* Assert behind not */
0, /* Reverse */
0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
0, 0, 0, /* SBRA, SCBRA, SCOND */
0, 0, /* CREF, NCREF */
0, 0, /* RREF, NRREF */
0, /* DEF */
0, 0, /* BRAZERO, BRAMINZERO */
0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
}; };
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W, /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
@ -170,7 +257,7 @@ typedef struct stateblock {
#define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int)) #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
#ifdef DEBUG #ifdef PCRE_DEBUG
/************************************************* /*************************************************
* Print character string * * Print character string *
*************************************************/ *************************************************/
@ -390,6 +477,11 @@ if (*first_op == OP_REVERSE)
current_subject -= gone_back; current_subject -= gone_back;
} }
/* Save the earliest consulted character */
if (current_subject < md->start_used_ptr)
md->start_used_ptr = current_subject;
/* Now we can process the individual branches. */ /* Now we can process the individual branches. */
end_code = this_start_code; end_code = this_start_code;
@ -454,6 +546,8 @@ for (;;)
int i, j; int i, j;
int clen, dlen; int clen, dlen;
unsigned int c, d; unsigned int c, d;
int forced_fail = 0;
BOOL could_continue = FALSE;
/* Make the new state list into the active state list and empty the /* Make the new state list into the active state list and empty the
new state list. */ new state list. */
@ -467,7 +561,7 @@ for (;;)
workspace[0] ^= 1; /* Remember for the restarting feature */ workspace[0] ^= 1; /* Remember for the restarting feature */
workspace[1] = active_count; workspace[1] = active_count;
#ifdef DEBUG #ifdef PCRE_DEBUG
printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP); printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
pchars((uschar *)ptr, strlen((char *)ptr), stdout); pchars((uschar *)ptr, strlen((char *)ptr), stdout);
printf("\"\n"); printf("\"\n");
@ -511,9 +605,9 @@ for (;;)
stateblock *current_state = active_states + i; stateblock *current_state = active_states + i;
const uschar *code; const uschar *code;
int state_offset = current_state->offset; int state_offset = current_state->offset;
int count, codevalue; int count, codevalue, rrc;
#ifdef DEBUG #ifdef PCRE_DEBUG
printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset); printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
if (clen == 0) printf("EOL\n"); if (clen == 0) printf("EOL\n");
else if (c > 32 && c < 127) printf("'%c'\n", c); else if (c > 32 && c < 127) printf("'%c'\n", c);
@ -543,7 +637,9 @@ for (;;)
} }
} }
/* Check for a duplicate state with the same count, and skip if found. */ /* Check for a duplicate state with the same count, and skip if found.
See the note at the head of this module about the possibility of improving
performance here. */
for (j = 0; j < i; j++) for (j = 0; j < i; j++)
{ {
@ -560,6 +656,12 @@ for (;;)
code = start_code + state_offset; code = start_code + state_offset;
codevalue = *code; codevalue = *code;
/* If this opcode inspects a character, but we are at the end of the
subject, remember the fact for use when testing for a partial match. */
if (clen == 0 && poptable[codevalue] != 0)
could_continue = TRUE;
/* If this opcode is followed by an inline character, load it. It is /* If this opcode is followed by an inline character, load it. It is
tempting to test for the presence of a subject character here, but that tempting to test for the presence of a subject character here, but that
is wrong, because sometimes zero repetitions of the subject are is wrong, because sometimes zero repetitions of the subject are
@ -606,11 +708,24 @@ for (;;)
switch (codevalue) switch (codevalue)
{ {
/* ========================================================================== */
/* These cases are never obeyed. This is a fudge that causes a compile-
time error if the vectors coptable or poptable, which are indexed by
opcode, are not the correct length. It seems to be the only way to do
such a check at compile time, as the sizeof() operator does not work
in the C preprocessor. */
case OP_TABLE_LENGTH:
case OP_TABLE_LENGTH +
((sizeof(coptable) == OP_TABLE_LENGTH) &&
(sizeof(poptable) == OP_TABLE_LENGTH)):
break;
/* ========================================================================== */ /* ========================================================================== */
/* Reached a closing bracket. If not at the end of the pattern, carry /* Reached a closing bracket. If not at the end of the pattern, carry
on with the next opcode. Otherwise, unless we have an empty string and on with the next opcode. Otherwise, unless we have an empty string and
PCRE_NOTEMPTY is set, save the match data, shifting up all previous PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
start of the subject, save the match data, shifting up all previous
matches so we always have the longest first. */ matches so we always have the longest first. */
case OP_KET: case OP_KET:
@ -624,26 +739,32 @@ for (;;)
ADD_ACTIVE(state_offset - GET(code, 1), 0); ADD_ACTIVE(state_offset - GET(code, 1), 0);
} }
} }
else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0) else
{ {
if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0; if (ptr > current_subject ||
else if (match_count > 0 && ++match_count * 2 >= offsetcount) ((md->moptions & PCRE_NOTEMPTY) == 0 &&
match_count = 0; ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
count = ((match_count == 0)? offsetcount : match_count * 2) - 2; current_subject > start_subject + md->start_offset)))
if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
if (offsetcount >= 2)
{ {
offsets[0] = current_subject - start_subject; if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
offsets[1] = ptr - start_subject; else if (match_count > 0 && ++match_count * 2 >= offsetcount)
DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP, match_count = 0;
offsets[1] - offsets[0], current_subject)); count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
} if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
if ((md->moptions & PCRE_DFA_SHORTEST) != 0) if (offsetcount >= 2)
{ {
DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n" offsets[0] = current_subject - start_subject;
"%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, offsets[1] = ptr - start_subject;
match_count, rlevel*2-2, SP)); DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
return match_count; offsets[1] - offsets[0], current_subject));
}
if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
{
DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
"%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
match_count, rlevel*2-2, SP));
return match_count;
}
} }
} }
break; break;
@ -757,7 +878,7 @@ for (;;)
if ((md->moptions & PCRE_NOTEOL) == 0) if ((md->moptions & PCRE_NOTEOL) == 0)
{ {
if (clen == 0 || if (clen == 0 ||
(IS_NEWLINE(ptr) && ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen) ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
)) ))
{ ADD_ACTIVE(state_offset + 1, 0); } { ADD_ACTIVE(state_offset + 1, 0); }
@ -794,6 +915,7 @@ for (;;)
if (ptr > start_subject) if (ptr > start_subject)
{ {
const uschar *temp = ptr - 1; const uschar *temp = ptr - 1;
if (temp < md->start_used_ptr) md->start_used_ptr = temp;
#ifdef SUPPORT_UTF8 #ifdef SUPPORT_UTF8
if (utf8) BACKCHAR(temp); if (utf8) BACKCHAR(temp);
#endif #endif
@ -802,8 +924,9 @@ for (;;)
} }
else left_word = 0; else left_word = 0;
if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0; if (clen > 0)
else right_word = 0; right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
else right_word = 0;
if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY)) if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
{ ADD_ACTIVE(state_offset + 1, 0); } { ADD_ACTIVE(state_offset + 1, 0); }
@ -2157,11 +2280,12 @@ for (;;)
/* ========================================================================== */ /* ========================================================================== */
/* These are the opcodes for fancy brackets of various kinds. We have /* These are the opcodes for fancy brackets of various kinds. We have
to use recursion in order to handle them. The "always failing" assersion to use recursion in order to handle them. The "always failing" assertion
(?!) is optimised when compiling to OP_FAIL, so we have to support that, (?!) is optimised to OP_FAIL when compiling, so we have to support that,
though the other "backtracking verbs" are not supported. */ though the other "backtracking verbs" are not supported. */
case OP_FAIL: case OP_FAIL:
forced_fail++; /* Count FAILs for multiple states */
break; break;
case OP_ASSERT: case OP_ASSERT:
@ -2189,6 +2313,7 @@ for (;;)
rlevel, /* function recursion level */ rlevel, /* function recursion level */
recursing); /* pass on regex recursion */ recursing); /* pass on regex recursion */
if (rc == PCRE_ERROR_DFA_UITEM) return rc;
if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK)) if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
{ ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); } { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
} }
@ -2200,29 +2325,60 @@ for (;;)
{ {
int local_offsets[1000]; int local_offsets[1000];
int local_workspace[1000]; int local_workspace[1000];
int condcode = code[LINK_SIZE+1]; int codelink = GET(code, 1);
int condcode;
/* Because of the way auto-callout works during compile, a callout item
is inserted between OP_COND and an assertion condition. This does not
happen for the other conditions. */
if (code[LINK_SIZE+1] == OP_CALLOUT)
{
rrc = 0;
if (pcre_callout != NULL)
{
pcre_callout_block cb;
cb.version = 1; /* Version 1 of the callout block */
cb.callout_number = code[LINK_SIZE+2];
cb.offset_vector = offsets;
cb.subject = (PCRE_SPTR)start_subject;
cb.subject_length = end_subject - start_subject;
cb.start_match = current_subject - start_subject;
cb.current_position = ptr - start_subject;
cb.pattern_position = GET(code, LINK_SIZE + 3);
cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
cb.capture_top = 1;
cb.capture_last = -1;
cb.callout_data = md->callout_data;
if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
}
if (rrc > 0) break; /* Fail this thread */
code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
}
condcode = code[LINK_SIZE+1];
/* Back reference conditions are not supported */ /* Back reference conditions are not supported */
if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND; if (condcode == OP_CREF || condcode == OP_NCREF)
return PCRE_ERROR_DFA_UCOND;
/* The DEFINE condition is always false */ /* The DEFINE condition is always false */
if (condcode == OP_DEF) if (condcode == OP_DEF)
{ { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
}
/* The only supported version of OP_RREF is for the value RREF_ANY, /* The only supported version of OP_RREF is for the value RREF_ANY,
which means "test if in any recursion". We can't test for specifically which means "test if in any recursion". We can't test for specifically
recursed groups. */ recursed groups. */
else if (condcode == OP_RREF) else if (condcode == OP_RREF || condcode == OP_NRREF)
{ {
int value = GET2(code, LINK_SIZE+2); int value = GET2(code, LINK_SIZE+2);
if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND; if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); } if (recursing > 0)
else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); } { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
} }
/* Otherwise, the condition is an assertion */ /* Otherwise, the condition is an assertion */
@ -2248,11 +2404,12 @@ for (;;)
rlevel, /* function recursion level */ rlevel, /* function recursion level */
recursing); /* pass on regex recursion */ recursing); /* pass on regex recursion */
if (rc == PCRE_ERROR_DFA_UITEM) return rc;
if ((rc >= 0) == if ((rc >= 0) ==
(condcode == OP_ASSERT || condcode == OP_ASSERTBACK)) (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
{ ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); } { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
else else
{ ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); } { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
} }
} }
break; break;
@ -2404,9 +2561,9 @@ for (;;)
/* Handle callouts */ /* Handle callouts */
case OP_CALLOUT: case OP_CALLOUT:
rrc = 0;
if (pcre_callout != NULL) if (pcre_callout != NULL)
{ {
int rrc;
pcre_callout_block cb; pcre_callout_block cb;
cb.version = 1; /* Version 1 of the callout block */ cb.version = 1; /* Version 1 of the callout block */
cb.callout_number = code[1]; cb.callout_number = code[1];
@ -2421,8 +2578,9 @@ for (;;)
cb.capture_last = -1; cb.capture_last = -1;
cb.callout_data = md->callout_data; cb.callout_data = md->callout_data;
if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */ if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
} }
if (rrc == 0)
{ ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
break; break;
@ -2438,19 +2596,33 @@ for (;;)
/* We have finished the processing at the current subject character. If no /* We have finished the processing at the current subject character. If no
new states have been set for the next character, we have found all the new states have been set for the next character, we have found all the
matches that we are going to find. If we are at the top level and partial matches that we are going to find. If we are at the top level and partial
matching has been requested, check for appropriate conditions. */ matching has been requested, check for appropriate conditions.
The "forced_fail" variable counts the number of (*F) encountered for the
character. If it is equal to the original active_count (saved in
workspace[1]) it means that (*F) was found on every active state. In this
case we don't want to give a partial match.
The "could_continue" variable is true if a state could have continued but
for the fact that the end of the subject was reached. */
if (new_count <= 0) if (new_count <= 0)
{ {
if (match_count < 0 && /* No matches found */ if (rlevel == 1 && /* Top level, and */
rlevel == 1 && /* Top level match function */ could_continue && /* Some could go on */
(md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */ forced_fail != workspace[1] && /* Not all forced fail & */
ptr >= end_subject && /* Reached end of subject */ ( /* either... */
ptr > current_subject) /* Matched non-empty string */ (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
|| /* or... */
((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
match_count < 0) /* no matches */
) && /* And... */
ptr >= end_subject && /* Reached end of subject */
ptr > current_subject) /* Matched non-empty string */
{ {
if (offsetcount >= 2) if (offsetcount >= 2)
{ {
offsets[0] = current_subject - start_subject; offsets[0] = md->start_used_ptr - start_subject;
offsets[1] = end_subject - start_subject; offsets[1] = end_subject - start_subject;
} }
match_count = PCRE_ERROR_PARTIAL; match_count = PCRE_ERROR_PARTIAL;
@ -2592,6 +2764,7 @@ md->start_code = (const uschar *)argument_re +
re->name_table_offset + re->name_count * re->name_entry_size; re->name_table_offset + re->name_count * re->name_entry_size;
md->start_subject = (const unsigned char *)subject; md->start_subject = (const unsigned char *)subject;
md->end_subject = end_subject; md->end_subject = end_subject;
md->start_offset = start_offset;
md->moptions = options; md->moptions = options;
md->poptions = re->options; md->poptions = re->options;
@ -2614,10 +2787,10 @@ switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)option
PCRE_NEWLINE_BITS) PCRE_NEWLINE_BITS)
{ {
case 0: newline = NEWLINE; break; /* Compile-time default */ case 0: newline = NEWLINE; break; /* Compile-time default */
case PCRE_NEWLINE_CR: newline = '\r'; break; case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
case PCRE_NEWLINE_LF: newline = '\n'; break; case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
case PCRE_NEWLINE_CR+ case PCRE_NEWLINE_CR+
PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
case PCRE_NEWLINE_ANY: newline = -1; break; case PCRE_NEWLINE_ANY: newline = -1; break;
case PCRE_NEWLINE_ANYCRLF: newline = -2; break; case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
default: return PCRE_ERROR_BADNEWLINE; default: return PCRE_ERROR_BADNEWLINE;
@ -2696,8 +2869,8 @@ if (!anchored)
} }
else else
{ {
if (startline && study != NULL && if (!startline && study != NULL &&
(study->options & PCRE_STUDY_MAPPED) != 0) (study->flags & PCRE_STUDY_MAPPED) != 0)
start_bits = study->start_bits; start_bits = study->start_bits;
} }
} }
@ -2713,9 +2886,8 @@ if ((re->flags & PCRE_REQCHSET) != 0)
} }
/* Call the main matching function, looping for a non-anchored regex after a /* Call the main matching function, looping for a non-anchored regex after a
failed match. Unless restarting, optimize by moving to the first match failed match. If not restarting, perform certain optimizations at the start of
character if possible, when not anchored. Then unless wanting a partial match, a match. */
check for a required later character. */
for (;;) for (;;)
{ {
@ -2725,11 +2897,10 @@ for (;;)
{ {
const uschar *save_end_subject = end_subject; const uschar *save_end_subject = end_subject;
/* Advance to a unique first char if possible. If firstline is TRUE, the /* If firstline is TRUE, the start of the match is constrained to the first
start of the match is constrained to the first line of a multiline string. line of a multiline string. Implement this by temporarily adjusting
Implement this by temporarily adjusting end_subject so that we stop end_subject so that we stop scanning at a newline. If the match fails at
scanning at a newline. If the match fails at the newline, later code breaks the newline, later code breaks this loop. */
this loop. */
if (firstline) if (firstline)
{ {
@ -2749,126 +2920,151 @@ for (;;)
end_subject = t; end_subject = t;
} }
if (first_byte >= 0) /* There are some optimizations that avoid running the match if a known
{ starting point is not found. However, there is an option that disables
if (first_byte_caseless) these, for testing and for ensuring that all callouts do actually occur. */
while (current_subject < end_subject &&
lcc[*current_subject] != first_byte)
current_subject++;
else
while (current_subject < end_subject && *current_subject != first_byte)
current_subject++;
}
/* Or to just after a linebreak for a multiline match if possible */ if ((options & PCRE_NO_START_OPTIMIZE) == 0)
else if (startline)
{ {
if (current_subject > md->start_subject + start_offset) /* Advance to a known first byte. */
if (first_byte >= 0)
{ {
#ifdef SUPPORT_UTF8 if (first_byte_caseless)
if (utf8) while (current_subject < end_subject &&
{ lcc[*current_subject] != first_byte)
while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
{
current_subject++; current_subject++;
while(current_subject < end_subject &&
(*current_subject & 0xc0) == 0x80)
current_subject++;
}
}
else else
#endif while (current_subject < end_subject &&
while (current_subject < end_subject && !WAS_NEWLINE(current_subject)) *current_subject != first_byte)
current_subject++; current_subject++;
/* If we have just passed a CR and the newline option is ANY or
ANYCRLF, and we are now at a LF, advance the match position by one more
character. */
if (current_subject[-1] == '\r' &&
(md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
current_subject < end_subject &&
*current_subject == '\n')
current_subject++;
} }
}
/* Or to a non-unique first char after study */ /* Or to just after a linebreak for a multiline match if possible */
else if (start_bits != NULL) else if (startline)
{
while (current_subject < end_subject)
{ {
register unsigned int c = *current_subject; if (current_subject > md->start_subject + start_offset)
if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++; {
else break; #ifdef SUPPORT_UTF8
if (utf8)
{
while (current_subject < end_subject &&
!WAS_NEWLINE(current_subject))
{
current_subject++;
while(current_subject < end_subject &&
(*current_subject & 0xc0) == 0x80)
current_subject++;
}
}
else
#endif
while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
current_subject++;
/* If we have just passed a CR and the newline option is ANY or
ANYCRLF, and we are now at a LF, advance the match position by one
more character. */
if (current_subject[-1] == CHAR_CR &&
(md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
current_subject < end_subject &&
*current_subject == CHAR_NL)
current_subject++;
}
}
/* Or to a non-unique first char after study */
else if (start_bits != NULL)
{
while (current_subject < end_subject)
{
register unsigned int c = *current_subject;
if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
else break;
}
} }
} }
/* Restore fudged end_subject */ /* Restore fudged end_subject */
end_subject = save_end_subject; end_subject = save_end_subject;
}
/* If req_byte is set, we know that that character must appear in the subject /* The following two optimizations are disabled for partial matching or if
for the match to succeed. If the first character is set, req_byte must be disabling is explicitly requested (and of course, by the test above, this
later in the subject; otherwise the test starts at the match point. This code is not obeyed when restarting after a partial match). */
optimization can save a huge amount of work in patterns with nested unlimited
repeats that aren't going to match. Writing separate code for cased/caseless
versions makes it go faster, as does using an autoincrement and backing off
on a match.
HOWEVER: when the subject string is very, very long, searching to its end can if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
take a long time, and give bad performance on quite ordinary patterns. This (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
showed up when somebody was matching /^C/ on a 32-megabyte string... so we
don't do this when the string is sufficiently long.
ALSO: this processing is disabled when partial matching is requested.
*/
if (req_byte >= 0 &&
end_subject - current_subject < REQ_BYTE_MAX &&
(options & PCRE_PARTIAL) == 0)
{
register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
/* We don't need to repeat the search if we haven't yet reached the
place we found it at last time. */
if (p > req_byte_ptr)
{ {
if (req_byte_caseless) /* If the pattern was studied, a minimum subject length may be set. This
is a lower bound; no actual string of that length may actually match the
pattern. Although the value is, strictly, in characters, we treat it as
bytes to avoid spending too much time in this optimization. */
if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
(pcre_uint32)(end_subject - current_subject) < study->minlength)
return PCRE_ERROR_NOMATCH;
/* If req_byte is set, we know that that character must appear in the
subject for the match to succeed. If the first character is set, req_byte
must be later in the subject; otherwise the test starts at the match
point. This optimization can save a huge amount of work in patterns with
nested unlimited repeats that aren't going to match. Writing separate
code for cased/caseless versions makes it go faster, as does using an
autoincrement and backing off on a match.
HOWEVER: when the subject string is very, very long, searching to its end
can take a long time, and give bad performance on quite ordinary
patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
string... so we don't do this when the string is sufficiently long. */
if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
{ {
while (p < end_subject) register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
/* We don't need to repeat the search if we haven't yet reached the
place we found it at last time. */
if (p > req_byte_ptr)
{ {
register int pp = *p++; if (req_byte_caseless)
if (pp == req_byte || pp == req_byte2) { p--; break; } {
while (p < end_subject)
{
register int pp = *p++;
if (pp == req_byte || pp == req_byte2) { p--; break; }
}
}
else
{
while (p < end_subject)
{
if (*p++ == req_byte) { p--; break; }
}
}
/* If we can't find the required character, break the matching loop,
which will cause a return of PCRE_ERROR_NOMATCH. */
if (p >= end_subject) break;
/* If we have found the required character, save the point where we
found it, so that we don't search again next time round the loop if
the start hasn't passed this character yet. */
req_byte_ptr = p;
} }
} }
else
{
while (p < end_subject)
{
if (*p++ == req_byte) { p--; break; }
}
}
/* If we can't find the required character, break the matching loop,
which will cause a return of PCRE_ERROR_NOMATCH. */
if (p >= end_subject) break;
/* If we have found the required character, save the point where we
found it, so that we don't search again next time round the loop if
the start hasn't passed this character yet. */
req_byte_ptr = p;
} }
} } /* End of optimizations that are done when not restarting */
/* OK, now we can do the business */ /* OK, now we can do the business */
md->start_used_ptr = current_subject;
rc = internal_dfa_exec( rc = internal_dfa_exec(
md, /* fixed match data */ md, /* fixed match data */
md->start_code, /* this subexpression's code */ md->start_code, /* this subexpression's code */
@ -2903,9 +3099,9 @@ for (;;)
not contain any explicit matches for \r or \n, and the newline option is CRLF not contain any explicit matches for \r or \n, and the newline option is CRLF
or ANY or ANYCRLF, advance the match position by one more character. */ or ANY or ANYCRLF, advance the match position by one more character. */
if (current_subject[-1] == '\r' && if (current_subject[-1] == CHAR_CR &&
current_subject < end_subject && current_subject < end_subject &&
*current_subject == '\n' && *current_subject == CHAR_NL &&
(re->flags & PCRE_HASCRORLF) == 0 && (re->flags & PCRE_HASCRORLF) == 0 &&
(md->nltype == NLTYPE_ANY || (md->nltype == NLTYPE_ANY ||
md->nltype == NLTYPE_ANYCRLF || md->nltype == NLTYPE_ANYCRLF ||

File diff suppressed because it is too large Load Diff

View File

@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language. and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Copyright (c) 1997-2008 University of Cambridge Copyright (c) 1997-2009 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -89,7 +89,7 @@ if (re->magic_number != MAGIC_NUMBER)
switch (what) switch (what)
{ {
case PCRE_INFO_OPTIONS: case PCRE_INFO_OPTIONS:
*((unsigned long int *)where) = re->options & PUBLIC_OPTIONS; *((unsigned long int *)where) = re->options & PUBLIC_COMPILE_OPTIONS;
break; break;
case PCRE_INFO_SIZE: case PCRE_INFO_SIZE:
@ -119,10 +119,16 @@ switch (what)
case PCRE_INFO_FIRSTTABLE: case PCRE_INFO_FIRSTTABLE:
*((const uschar **)where) = *((const uschar **)where) =
(study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)? (study != NULL && (study->flags & PCRE_STUDY_MAPPED) != 0)?
((const pcre_study_data *)extra_data->study_data)->start_bits : NULL; ((const pcre_study_data *)extra_data->study_data)->start_bits : NULL;
break; break;
case PCRE_INFO_MINLENGTH:
*((int *)where) =
(study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0)?
study->minlength : -1;
break;
case PCRE_INFO_LASTLITERAL: case PCRE_INFO_LASTLITERAL:
*((int *)where) = *((int *)where) =
((re->flags & PCRE_REQCHSET) != 0)? re->req_byte : -1; ((re->flags & PCRE_REQCHSET) != 0)? re->req_byte : -1;
@ -144,6 +150,9 @@ switch (what)
*((const uschar **)where) = (const uschar *)(_pcre_default_tables); *((const uschar **)where) = (const uschar *)(_pcre_default_tables);
break; break;
/* From release 8.00 this will always return TRUE because NOPARTIAL is
no longer ever set (the restrictions have been removed). */
case PCRE_INFO_OKPARTIAL: case PCRE_INFO_OKPARTIAL:
*((int *)where) = (re->flags & PCRE_NOPARTIAL) == 0; *((int *)where) = (re->flags & PCRE_NOPARTIAL) == 0;
break; break;

View File

@ -43,8 +43,14 @@ PCRE is thread-clean and doesn't use any global variables in the normal sense.
However, it calls memory allocation and freeing functions via the four However, it calls memory allocation and freeing functions via the four
indirections below, and it can optionally do callouts, using the fifth indirections below, and it can optionally do callouts, using the fifth
indirection. These values can be changed by the caller, but are shared between indirection. These values can be changed by the caller, but are shared between
all threads. However, when compiling for Virtual Pascal, things are done all threads.
differently, and global variables are not used (see pcre.in). */
For MS Visual Studio and Symbian OS, there are problems in initializing these
variables to non-local functions. In these cases, therefore, an indirection via
a local function is used.
Also, when compiling for Virtual Pascal, things are done differently, and
global variables are not used. */
#ifdef HAVE_CONFIG_H #ifdef HAVE_CONFIG_H
#include "config.h" #include "config.h"
@ -52,6 +58,19 @@ differently, and global variables are not used (see pcre.in). */
#include "pcre_internal.h" #include "pcre_internal.h"
#if defined _MSC_VER || defined __SYMBIAN32__
/* File-local trampoline for malloc(): passes the requested size through
unchanged and returns whatever malloc() returned (NULL included). */
static void* LocalPcreMalloc(size_t aSize)
{
void *lBlock = malloc(aSize);
return lBlock;
}
/* File-local trampoline for free(): releases the block the caller passes in.
The pointer is forwarded untouched, so a NULL argument behaves exactly as it
does for free() itself. */
static void LocalPcreFree(void* aPtr)
{
void *lDoomed = aPtr;
free(lDoomed);
}
PCRE_EXP_DATA_DEFN int (*pcre_callout)(pcre_callout_block *) = NULL; PCRE_EXP_DATA_DEFN int (*pcre_callout)(pcre_callout_block *) = NULL;
#elif !defined VPCOMPAT
PCRE_EXP_DATA_DEFN int (*pcre_callout)(pcre_callout_block *) = NULL;
#endif
/* End of pcre_globals.c */ /* End of pcre_globals.c */

View File

@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language. and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Copyright (c) 1997-2008 University of Cambridge Copyright (c) 1997-2009 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -83,7 +83,7 @@ if (re->magic_number != MAGIC_NUMBER)
re = _pcre_try_flipped(re, &internal_re, NULL, NULL); re = _pcre_try_flipped(re, &internal_re, NULL, NULL);
if (re == NULL) return PCRE_ERROR_BADMAGIC; if (re == NULL) return PCRE_ERROR_BADMAGIC;
} }
if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS); if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_COMPILE_OPTIONS);
if (first_byte != NULL) if (first_byte != NULL)
*first_byte = ((re->flags & PCRE_FIRSTSET) != 0)? re->first_byte : *first_byte = ((re->flags & PCRE_FIRSTSET) != 0)? re->first_byte :
((re->flags & PCRE_STARTLINE) != 0)? -1 : -2; ((re->flags & PCRE_STARTLINE) != 0)? -1 : -2;

View File

@ -7,7 +7,7 @@
and semantics are as close as possible to those of the Perl 5 language. and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Copyright (c) 1997-2008 University of Cambridge Copyright (c) 1997-2010 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -45,10 +45,24 @@ functions whose names all begin with "_pcre_". */
#ifndef PCRE_INTERNAL_H #ifndef PCRE_INTERNAL_H
#define PCRE_INTERNAL_H #define PCRE_INTERNAL_H
/* Define DEBUG to get debugging output on stdout. */ /* Define PCRE_DEBUG to get debugging output on stdout. */
#if 0 #if 0
#define DEBUG #define PCRE_DEBUG
#endif
/* We do not support both EBCDIC and UTF-8 at the same time. The "configure"
script prevents both being selected, but not everybody uses "configure". */
#if defined EBCDIC && defined SUPPORT_UTF8
#error The use of both EBCDIC and SUPPORT_UTF8 is not supported.
#endif
/* If SUPPORT_UCP is defined, SUPPORT_UTF8 must also be defined. The
"configure" script ensures this, but not everybody uses "configure". */
#if defined SUPPORT_UCP && !defined SUPPORT_UTF8
#define SUPPORT_UTF8 1
#endif #endif
/* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
@ -60,7 +74,7 @@ It turns out that the Mac Debugging.h header also defines the macro DPRINTF, so
be absolutely sure we get our version. */ be absolutely sure we get our version. */
#undef DPRINTF #undef DPRINTF
#ifdef DEBUG #ifdef PCRE_DEBUG
#define DPRINTF(p) printf p #define DPRINTF(p) printf p
#else #else
#define DPRINTF(p) /* Nothing */ #define DPRINTF(p) /* Nothing */
@ -72,8 +86,6 @@ setjmp and stdarg are used is when NO_RECURSE is set. */
#include <ctype.h> #include <ctype.h>
#include <limits.h> #include <limits.h>
#include <setjmp.h>
#include <stdarg.h>
#include <stddef.h> #include <stddef.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
@ -172,6 +184,26 @@ preprocessor time in standard C environments. */
#error Cannot determine a type for 32-bit unsigned integers #error Cannot determine a type for 32-bit unsigned integers
#endif #endif
/* When checking for integer overflow in pcre_compile(), we need to handle
large integers. If a 64-bit integer type is available, we can use that.
Otherwise we have to cast to double, which of course requires floating point
arithmetic. Handle this by defining a macro for the appropriate type. If
stdint.h is available, include it; it may define INT64_MAX. Systems that do not
have stdint.h (e.g. Solaris) may have inttypes.h. The macro int64_t may be set
by "configure". */
#if HAVE_STDINT_H
#include <stdint.h>
#elif HAVE_INTTYPES_H
#include <inttypes.h>
#endif
#if defined INT64_MAX || defined int64_t
#define INT64_OR_DOUBLE int64_t
#else
#define INT64_OR_DOUBLE double
#endif
/* All character handling must be done as unsigned characters. Otherwise there /* All character handling must be done as unsigned characters. Otherwise there
are problems with top-bit-set characters and functions such as isspace(). are problems with top-bit-set characters and functions such as isspace().
However, we leave the interface to the outside world as char *, because that However, we leave the interface to the outside world as char *, because that
@ -259,6 +291,7 @@ option on the command line. */
#define strncmp(s1,s2,m) _strncmp(s1,s2,m) #define strncmp(s1,s2,m) _strncmp(s1,s2,m)
#define memcmp(s,c,n) _memcmp(s,c,n) #define memcmp(s,c,n) _memcmp(s,c,n)
#define memcpy(d,s,n) _memcpy(d,s,n) #define memcpy(d,s,n) _memcpy(d,s,n)
#define memmove(d,s,n) _memmove(d,s,n)
#define memset(s,c,n) _memset(s,c,n) #define memset(s,c,n) _memset(s,c,n)
#else /* VPCOMPAT */ #else /* VPCOMPAT */
@ -477,6 +510,26 @@ if there are extra bytes. This is called when we know we are in UTF-8 mode. */
len += gcaa; \ len += gcaa; \
} }
/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
pointer, incrementing length if there are extra bytes. This is called when we
do not know whether we are in UTF-8 mode: the macro itself tests the caller's
"utf8" variable, and when that is false (or the first byte is below 0xc0) c is
just the single byte at eptr and len is left unchanged.

The expansion is two statements (an assignment followed by an "if" block), so
it must be used as a statement inside braces, never as the sole body of an
unbraced if/else/loop. The arguments are substituted textually and eptr is
evaluated more than once. */
#define GETCHARLENTEST(c, eptr, len) \
c = *eptr; \
if (utf8 && c >= 0xc0) \
{ \
int gcii; \
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
int gcss = 6*gcaa; \
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
for (gcii = 1; gcii <= gcaa; gcii++) \
{ \
gcss -= 6; \
c |= (eptr[gcii] & 0x3f) << gcss; \
} \
len += gcaa; \
}
/* If the pointer is not at the start of a character, move it back until /* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-8 mode - we don't put a test within the macro it is. This is called only in UTF-8 mode - we don't put a test within the macro
because almost all calls are already within a block of UTF-8 only code. */ because almost all calls are already within a block of UTF-8 only code. */
@ -500,7 +553,9 @@ Standard C system should have one. */
/* Private flags containing information about the compiled regex. They used to /* Private flags containing information about the compiled regex. They used to
live at the top end of the options word, but that got almost full, so now they live at the top end of the options word, but that got almost full, so now they
are in a 16-bit flags word. */ are in a 16-bit flags word. From release 8.00, PCRE_NOPARTIAL is unused, as
the restrictions on partial matching have been lifted. It remains for backwards
compatibility. */
#define PCRE_NOPARTIAL 0x0001 /* can't use partial with this regex */ #define PCRE_NOPARTIAL 0x0001 /* can't use partial with this regex */
#define PCRE_FIRSTSET 0x0002 /* first_byte is set */ #define PCRE_FIRSTSET 0x0002 /* first_byte is set */
@ -512,6 +567,7 @@ are in a 16-bit flags word. */
/* Options for the "extra" block produced by pcre_study(). */ /* Options for the "extra" block produced by pcre_study(). */
#define PCRE_STUDY_MAPPED 0x01 /* a map of starting chars exists */ #define PCRE_STUDY_MAPPED 0x01 /* a map of starting chars exists */
#define PCRE_STUDY_MINLEN 0x02 /* a minimum length field exists */
/* Masks for identifying the public options that are permitted at compile /* Masks for identifying the public options that are permitted at compile
time, run time, or study time, respectively. */ time, run time, or study time, respectively. */
@ -519,7 +575,7 @@ time, run time, or study time, respectively. */
#define PCRE_NEWLINE_BITS (PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_ANY| \ #define PCRE_NEWLINE_BITS (PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_ANY| \
PCRE_NEWLINE_ANYCRLF) PCRE_NEWLINE_ANYCRLF)
#define PUBLIC_OPTIONS \ #define PUBLIC_COMPILE_OPTIONS \
(PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \ (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \ PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \ PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \
@ -527,13 +583,15 @@ time, run time, or study time, respectively. */
PCRE_JAVASCRIPT_COMPAT) PCRE_JAVASCRIPT_COMPAT)
#define PUBLIC_EXEC_OPTIONS \ #define PUBLIC_EXEC_OPTIONS \
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \
PCRE_PARTIAL|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE) PCRE_NO_UTF8_CHECK|PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_NEWLINE_BITS| \
PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE|PCRE_NO_START_OPTIMIZE)
#define PUBLIC_DFA_EXEC_OPTIONS \ #define PUBLIC_DFA_EXEC_OPTIONS \
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \ (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \
PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_BITS| \ PCRE_NO_UTF8_CHECK|PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_DFA_SHORTEST| \
PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE) PCRE_DFA_RESTART|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \
PCRE_NO_START_OPTIMIZE)
#define PUBLIC_STUDY_OPTIONS 0 /* None defined */ #define PUBLIC_STUDY_OPTIONS 0 /* None defined */
@ -559,33 +617,566 @@ variable-length repeat, or a anything other than literal characters. */
#define REQ_VARY 0x0200 /* reqbyte followed non-literal item */ #define REQ_VARY 0x0200 /* reqbyte followed non-literal item */
/* Miscellaneous definitions. The #ifndef is to pacify compiler warnings in /* Miscellaneous definitions. The #ifndef is to pacify compiler warnings in
environments where these macros are defined elsewhere. */ environments where these macros are defined elsewhere. Unfortunately, there
is no way to do the same for the typedef. */
typedef gboolean BOOL; typedef gboolean BOOL;
/* If PCRE is to support UTF-8 on EBCDIC platforms, we cannot use normal
character constants like '*' because the compiler would emit their EBCDIC code,
which is different from their ASCII/UTF-8 code. Instead we define macros for
the characters so that they always use the ASCII/UTF-8 code when UTF-8 support
is enabled. When UTF-8 support is not enabled, the definitions use character
literals. Both character and string versions of each character are needed, and
there are some longer strings as well.
This means that, on EBCDIC platforms, the PCRE library can handle either
EBCDIC, or UTF-8, but not both. To support both in the same compiled library
would need different lookups depending on whether PCRE_UTF8 was set or not.
This would make it impossible to use characters in switch/case statements,
which would reduce performance. For a theoretical use (which nobody has asked
for) in a minority area (EBCDIC platforms), this is not sensible. Any
application that did need both could compile two versions of the library, using
macros to give the functions distinct names. */
#ifndef SUPPORT_UTF8
/* UTF-8 support is not enabled; use the platform-dependent character literals
so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */
#define CHAR_HT '\t'
#define CHAR_VT '\v'
#define CHAR_FF '\f'
#define CHAR_CR '\r'
#define CHAR_NL '\n'
#define CHAR_BS '\b'
#define CHAR_BEL '\a'
#ifdef EBCDIC
#define CHAR_ESC '\047'
#define CHAR_DEL '\007'
#else
#define CHAR_ESC '\033'
#define CHAR_DEL '\177'
#endif
#define CHAR_SPACE ' '
#define CHAR_EXCLAMATION_MARK '!'
#define CHAR_QUOTATION_MARK '"'
#define CHAR_NUMBER_SIGN '#'
#define CHAR_DOLLAR_SIGN '$'
#define CHAR_PERCENT_SIGN '%'
#define CHAR_AMPERSAND '&'
#define CHAR_APOSTROPHE '\''
#define CHAR_LEFT_PARENTHESIS '('
#define CHAR_RIGHT_PARENTHESIS ')'
#define CHAR_ASTERISK '*'
#define CHAR_PLUS '+'
#define CHAR_COMMA ','
#define CHAR_MINUS '-'
#define CHAR_DOT '.'
#define CHAR_SLASH '/'
#define CHAR_0 '0'
#define CHAR_1 '1'
#define CHAR_2 '2'
#define CHAR_3 '3'
#define CHAR_4 '4'
#define CHAR_5 '5'
#define CHAR_6 '6'
#define CHAR_7 '7'
#define CHAR_8 '8'
#define CHAR_9 '9'
#define CHAR_COLON ':'
#define CHAR_SEMICOLON ';'
#define CHAR_LESS_THAN_SIGN '<'
#define CHAR_EQUALS_SIGN '='
#define CHAR_GREATER_THAN_SIGN '>'
#define CHAR_QUESTION_MARK '?'
#define CHAR_COMMERCIAL_AT '@'
#define CHAR_A 'A'
#define CHAR_B 'B'
#define CHAR_C 'C'
#define CHAR_D 'D'
#define CHAR_E 'E'
#define CHAR_F 'F'
#define CHAR_G 'G'
#define CHAR_H 'H'
#define CHAR_I 'I'
#define CHAR_J 'J'
#define CHAR_K 'K'
#define CHAR_L 'L'
#define CHAR_M 'M'
#define CHAR_N 'N'
#define CHAR_O 'O'
#define CHAR_P 'P'
#define CHAR_Q 'Q'
#define CHAR_R 'R'
#define CHAR_S 'S'
#define CHAR_T 'T'
#define CHAR_U 'U'
#define CHAR_V 'V'
#define CHAR_W 'W'
#define CHAR_X 'X'
#define CHAR_Y 'Y'
#define CHAR_Z 'Z'
#define CHAR_LEFT_SQUARE_BRACKET '['
#define CHAR_BACKSLASH '\\'
#define CHAR_RIGHT_SQUARE_BRACKET ']'
#define CHAR_CIRCUMFLEX_ACCENT '^'
#define CHAR_UNDERSCORE '_'
#define CHAR_GRAVE_ACCENT '`'
#define CHAR_a 'a'
#define CHAR_b 'b'
#define CHAR_c 'c'
#define CHAR_d 'd'
#define CHAR_e 'e'
#define CHAR_f 'f'
#define CHAR_g 'g'
#define CHAR_h 'h'
#define CHAR_i 'i'
#define CHAR_j 'j'
#define CHAR_k 'k'
#define CHAR_l 'l'
#define CHAR_m 'm'
#define CHAR_n 'n'
#define CHAR_o 'o'
#define CHAR_p 'p'
#define CHAR_q 'q'
#define CHAR_r 'r'
#define CHAR_s 's'
#define CHAR_t 't'
#define CHAR_u 'u'
#define CHAR_v 'v'
#define CHAR_w 'w'
#define CHAR_x 'x'
#define CHAR_y 'y'
#define CHAR_z 'z'
#define CHAR_LEFT_CURLY_BRACKET '{'
#define CHAR_VERTICAL_LINE '|'
#define CHAR_RIGHT_CURLY_BRACKET '}'
#define CHAR_TILDE '~'
#define STR_HT "\t"
#define STR_VT "\v"
#define STR_FF "\f"
#define STR_CR "\r"
#define STR_NL "\n"
#define STR_BS "\b"
#define STR_BEL "\a"
#ifdef EBCDIC
#define STR_ESC "\047"
#define STR_DEL "\007"
#else
#define STR_ESC "\033"
#define STR_DEL "\177"
#endif
#define STR_SPACE " "
#define STR_EXCLAMATION_MARK "!"
#define STR_QUOTATION_MARK "\""
#define STR_NUMBER_SIGN "#"
#define STR_DOLLAR_SIGN "$"
#define STR_PERCENT_SIGN "%"
#define STR_AMPERSAND "&"
#define STR_APOSTROPHE "'"
#define STR_LEFT_PARENTHESIS "("
#define STR_RIGHT_PARENTHESIS ")"
#define STR_ASTERISK "*"
#define STR_PLUS "+"
#define STR_COMMA ","
#define STR_MINUS "-"
#define STR_DOT "."
#define STR_SLASH "/"
#define STR_0 "0"
#define STR_1 "1"
#define STR_2 "2"
#define STR_3 "3"
#define STR_4 "4"
#define STR_5 "5"
#define STR_6 "6"
#define STR_7 "7"
#define STR_8 "8"
#define STR_9 "9"
#define STR_COLON ":"
#define STR_SEMICOLON ";"
#define STR_LESS_THAN_SIGN "<"
#define STR_EQUALS_SIGN "="
#define STR_GREATER_THAN_SIGN ">"
#define STR_QUESTION_MARK "?"
#define STR_COMMERCIAL_AT "@"
#define STR_A "A"
#define STR_B "B"
#define STR_C "C"
#define STR_D "D"
#define STR_E "E"
#define STR_F "F"
#define STR_G "G"
#define STR_H "H"
#define STR_I "I"
#define STR_J "J"
#define STR_K "K"
#define STR_L "L"
#define STR_M "M"
#define STR_N "N"
#define STR_O "O"
#define STR_P "P"
#define STR_Q "Q"
#define STR_R "R"
#define STR_S "S"
#define STR_T "T"
#define STR_U "U"
#define STR_V "V"
#define STR_W "W"
#define STR_X "X"
#define STR_Y "Y"
#define STR_Z "Z"
#define STR_LEFT_SQUARE_BRACKET "["
#define STR_BACKSLASH "\\"
#define STR_RIGHT_SQUARE_BRACKET "]"
#define STR_CIRCUMFLEX_ACCENT "^"
#define STR_UNDERSCORE "_"
#define STR_GRAVE_ACCENT "`"
#define STR_a "a"
#define STR_b "b"
#define STR_c "c"
#define STR_d "d"
#define STR_e "e"
#define STR_f "f"
#define STR_g "g"
#define STR_h "h"
#define STR_i "i"
#define STR_j "j"
#define STR_k "k"
#define STR_l "l"
#define STR_m "m"
#define STR_n "n"
#define STR_o "o"
#define STR_p "p"
#define STR_q "q"
#define STR_r "r"
#define STR_s "s"
#define STR_t "t"
#define STR_u "u"
#define STR_v "v"
#define STR_w "w"
#define STR_x "x"
#define STR_y "y"
#define STR_z "z"
#define STR_LEFT_CURLY_BRACKET "{"
#define STR_VERTICAL_LINE "|"
#define STR_RIGHT_CURLY_BRACKET "}"
#define STR_TILDE "~"
#define STRING_ACCEPT0 "ACCEPT\0"
#define STRING_COMMIT0 "COMMIT\0"
#define STRING_F0 "F\0"
#define STRING_FAIL0 "FAIL\0"
#define STRING_PRUNE0 "PRUNE\0"
#define STRING_SKIP0 "SKIP\0"
#define STRING_THEN "THEN"
#define STRING_alpha0 "alpha\0"
#define STRING_lower0 "lower\0"
#define STRING_upper0 "upper\0"
#define STRING_alnum0 "alnum\0"
#define STRING_ascii0 "ascii\0"
#define STRING_blank0 "blank\0"
#define STRING_cntrl0 "cntrl\0"
#define STRING_digit0 "digit\0"
#define STRING_graph0 "graph\0"
#define STRING_print0 "print\0"
#define STRING_punct0 "punct\0"
#define STRING_space0 "space\0"
#define STRING_word0 "word\0"
#define STRING_xdigit "xdigit"
#define STRING_DEFINE "DEFINE"
#define STRING_CR_RIGHTPAR "CR)"
#define STRING_LF_RIGHTPAR "LF)"
#define STRING_CRLF_RIGHTPAR "CRLF)"
#define STRING_ANY_RIGHTPAR "ANY)"
#define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)"
#define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)"
#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)"
#define STRING_UTF8_RIGHTPAR "UTF8)"
#else /* SUPPORT_UTF8 */
/* UTF-8 support is enabled; always use UTF-8 (=ASCII) character codes. This
works in both modes on non-EBCDIC platforms, and on EBCDIC platforms in UTF-8
mode only. */
#define CHAR_HT '\011'
#define CHAR_VT '\013'
#define CHAR_FF '\014'
#define CHAR_CR '\015'
#define CHAR_NL '\012'
#define CHAR_BS '\010'
#define CHAR_BEL '\007'
#define CHAR_ESC '\033'
#define CHAR_DEL '\177'
#define CHAR_SPACE '\040'
#define CHAR_EXCLAMATION_MARK '\041'
#define CHAR_QUOTATION_MARK '\042'
#define CHAR_NUMBER_SIGN '\043'
#define CHAR_DOLLAR_SIGN '\044'
#define CHAR_PERCENT_SIGN '\045'
#define CHAR_AMPERSAND '\046'
#define CHAR_APOSTROPHE '\047'
#define CHAR_LEFT_PARENTHESIS '\050'
#define CHAR_RIGHT_PARENTHESIS '\051'
#define CHAR_ASTERISK '\052'
#define CHAR_PLUS '\053'
#define CHAR_COMMA '\054'
#define CHAR_MINUS '\055'
#define CHAR_DOT '\056'
#define CHAR_SLASH '\057'
#define CHAR_0 '\060'
#define CHAR_1 '\061'
#define CHAR_2 '\062'
#define CHAR_3 '\063'
#define CHAR_4 '\064'
#define CHAR_5 '\065'
#define CHAR_6 '\066'
#define CHAR_7 '\067'
#define CHAR_8 '\070'
#define CHAR_9 '\071'
#define CHAR_COLON '\072'
#define CHAR_SEMICOLON '\073'
#define CHAR_LESS_THAN_SIGN '\074'
#define CHAR_EQUALS_SIGN '\075'
#define CHAR_GREATER_THAN_SIGN '\076'
#define CHAR_QUESTION_MARK '\077'
#define CHAR_COMMERCIAL_AT '\100'
#define CHAR_A '\101'
#define CHAR_B '\102'
#define CHAR_C '\103'
#define CHAR_D '\104'
#define CHAR_E '\105'
#define CHAR_F '\106'
#define CHAR_G '\107'
#define CHAR_H '\110'
#define CHAR_I '\111'
#define CHAR_J '\112'
#define CHAR_K '\113'
#define CHAR_L '\114'
#define CHAR_M '\115'
#define CHAR_N '\116'
#define CHAR_O '\117'
#define CHAR_P '\120'
#define CHAR_Q '\121'
#define CHAR_R '\122'
#define CHAR_S '\123'
#define CHAR_T '\124'
#define CHAR_U '\125'
#define CHAR_V '\126'
#define CHAR_W '\127'
#define CHAR_X '\130'
#define CHAR_Y '\131'
#define CHAR_Z '\132'
#define CHAR_LEFT_SQUARE_BRACKET '\133'
#define CHAR_BACKSLASH '\134'
#define CHAR_RIGHT_SQUARE_BRACKET '\135'
#define CHAR_CIRCUMFLEX_ACCENT '\136'
#define CHAR_UNDERSCORE '\137'
#define CHAR_GRAVE_ACCENT '\140'
#define CHAR_a '\141'
#define CHAR_b '\142'
#define CHAR_c '\143'
#define CHAR_d '\144'
#define CHAR_e '\145'
#define CHAR_f '\146'
#define CHAR_g '\147'
#define CHAR_h '\150'
#define CHAR_i '\151'
#define CHAR_j '\152'
#define CHAR_k '\153'
#define CHAR_l '\154'
#define CHAR_m '\155'
#define CHAR_n '\156'
#define CHAR_o '\157'
#define CHAR_p '\160'
#define CHAR_q '\161'
#define CHAR_r '\162'
#define CHAR_s '\163'
#define CHAR_t '\164'
#define CHAR_u '\165'
#define CHAR_v '\166'
#define CHAR_w '\167'
#define CHAR_x '\170'
#define CHAR_y '\171'
#define CHAR_z '\172'
#define CHAR_LEFT_CURLY_BRACKET '\173'
#define CHAR_VERTICAL_LINE '\174'
#define CHAR_RIGHT_CURLY_BRACKET '\175'
#define CHAR_TILDE '\176'
/* String forms of individual characters. Each STR_xxx macro is a
one-character string literal written as an octal escape, so its value is the
fixed code shown (the ASCII code of the named character) and does not depend
on the compiler's execution character set. Because they are string literals,
adjacent STR_xxx macros concatenate into longer strings. */
/* Control characters, DEL and space */
#define STR_HT "\011"
#define STR_VT "\013"
#define STR_FF "\014"
#define STR_CR "\015"
#define STR_NL "\012"
#define STR_BS "\010"
#define STR_BEL "\007"
#define STR_ESC "\033"
#define STR_DEL "\177"
#define STR_SPACE "\040"
/* Punctuation, octal 041 to 057 */
#define STR_EXCLAMATION_MARK "\041"
#define STR_QUOTATION_MARK "\042"
#define STR_NUMBER_SIGN "\043"
#define STR_DOLLAR_SIGN "\044"
#define STR_PERCENT_SIGN "\045"
#define STR_AMPERSAND "\046"
#define STR_APOSTROPHE "\047"
#define STR_LEFT_PARENTHESIS "\050"
#define STR_RIGHT_PARENTHESIS "\051"
#define STR_ASTERISK "\052"
#define STR_PLUS "\053"
#define STR_COMMA "\054"
#define STR_MINUS "\055"
#define STR_DOT "\056"
#define STR_SLASH "\057"
/* Decimal digits */
#define STR_0 "\060"
#define STR_1 "\061"
#define STR_2 "\062"
#define STR_3 "\063"
#define STR_4 "\064"
#define STR_5 "\065"
#define STR_6 "\066"
#define STR_7 "\067"
#define STR_8 "\070"
#define STR_9 "\071"
/* Punctuation, octal 072 to 100 */
#define STR_COLON "\072"
#define STR_SEMICOLON "\073"
#define STR_LESS_THAN_SIGN "\074"
#define STR_EQUALS_SIGN "\075"
#define STR_GREATER_THAN_SIGN "\076"
#define STR_QUESTION_MARK "\077"
#define STR_COMMERCIAL_AT "\100"
/* Upper case letters */
#define STR_A "\101"
#define STR_B "\102"
#define STR_C "\103"
#define STR_D "\104"
#define STR_E "\105"
#define STR_F "\106"
#define STR_G "\107"
#define STR_H "\110"
#define STR_I "\111"
#define STR_J "\112"
#define STR_K "\113"
#define STR_L "\114"
#define STR_M "\115"
#define STR_N "\116"
#define STR_O "\117"
#define STR_P "\120"
#define STR_Q "\121"
#define STR_R "\122"
#define STR_S "\123"
#define STR_T "\124"
#define STR_U "\125"
#define STR_V "\126"
#define STR_W "\127"
#define STR_X "\130"
#define STR_Y "\131"
#define STR_Z "\132"
/* Punctuation, octal 133 to 140 */
#define STR_LEFT_SQUARE_BRACKET "\133"
#define STR_BACKSLASH "\134"
#define STR_RIGHT_SQUARE_BRACKET "\135"
#define STR_CIRCUMFLEX_ACCENT "\136"
#define STR_UNDERSCORE "\137"
#define STR_GRAVE_ACCENT "\140"
/* Lower case letters */
#define STR_a "\141"
#define STR_b "\142"
#define STR_c "\143"
#define STR_d "\144"
#define STR_e "\145"
#define STR_f "\146"
#define STR_g "\147"
#define STR_h "\150"
#define STR_i "\151"
#define STR_j "\152"
#define STR_k "\153"
#define STR_l "\154"
#define STR_m "\155"
#define STR_n "\156"
#define STR_o "\157"
#define STR_p "\160"
#define STR_q "\161"
#define STR_r "\162"
#define STR_s "\163"
#define STR_t "\164"
#define STR_u "\165"
#define STR_v "\166"
#define STR_w "\167"
#define STR_x "\170"
#define STR_y "\171"
#define STR_z "\172"
/* Punctuation, octal 173 to 176 */
#define STR_LEFT_CURLY_BRACKET "\173"
#define STR_VERTICAL_LINE "\174"
#define STR_RIGHT_CURLY_BRACKET "\175"
#define STR_TILDE "\176"
/* Multi-character strings built by concatenating the one-character STR_xxx
string literals. A macro whose name ends in "0" appends an explicit "\0" after
its text, in addition to the terminator the compiler adds to a complete
literal. NOTE(review): presumably this lets several names be packed back to
back into a single table string, with the last name of each table (STRING_THEN,
STRING_xdigit) relying on the compiler's own terminator - confirm at the use
sites. */
/* Upper case keywords (NOTE(review): these look like the backtracking verb
names - confirm against the verb table in pcre_compile.c) */
#define STRING_ACCEPT0 STR_A STR_C STR_C STR_E STR_P STR_T "\0"
#define STRING_COMMIT0 STR_C STR_O STR_M STR_M STR_I STR_T "\0"
#define STRING_F0 STR_F "\0"
#define STRING_FAIL0 STR_F STR_A STR_I STR_L "\0"
#define STRING_PRUNE0 STR_P STR_R STR_U STR_N STR_E "\0"
#define STRING_SKIP0 STR_S STR_K STR_I STR_P "\0"
#define STRING_THEN STR_T STR_H STR_E STR_N
/* Lower case names (NOTE(review): these look like the POSIX character class
names - confirm against the class-name table in pcre_compile.c) */
#define STRING_alpha0 STR_a STR_l STR_p STR_h STR_a "\0"
#define STRING_lower0 STR_l STR_o STR_w STR_e STR_r "\0"
#define STRING_upper0 STR_u STR_p STR_p STR_e STR_r "\0"
#define STRING_alnum0 STR_a STR_l STR_n STR_u STR_m "\0"
#define STRING_ascii0 STR_a STR_s STR_c STR_i STR_i "\0"
#define STRING_blank0 STR_b STR_l STR_a STR_n STR_k "\0"
#define STRING_cntrl0 STR_c STR_n STR_t STR_r STR_l "\0"
#define STRING_digit0 STR_d STR_i STR_g STR_i STR_t "\0"
#define STRING_graph0 STR_g STR_r STR_a STR_p STR_h "\0"
#define STRING_print0 STR_p STR_r STR_i STR_n STR_t "\0"
#define STRING_punct0 STR_p STR_u STR_n STR_c STR_t "\0"
#define STRING_space0 STR_s STR_p STR_a STR_c STR_e "\0"
#define STRING_word0 STR_w STR_o STR_r STR_d "\0"
#define STRING_xdigit STR_x STR_d STR_i STR_g STR_i STR_t
/* The word DEFINE */
#define STRING_DEFINE STR_D STR_E STR_F STR_I STR_N STR_E
/* Keywords that each include their closing right parenthesis
(NOTE(review): presumably the option settings that may be given at the start
of a pattern - confirm in pcre_compile.c) */
#define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS
#define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS
#define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
#define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS
#define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS
#define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS
#endif /* SUPPORT_UTF8 */
/* Escape items that are just an encoding of a particular data value. */ /* Escape items that are just an encoding of a particular data value. */
#ifndef ESC_e #ifndef ESC_e
#define ESC_e 27 #define ESC_e CHAR_ESC
#endif #endif
#ifndef ESC_f #ifndef ESC_f
#define ESC_f '\f' #define ESC_f CHAR_FF
#endif #endif
#ifndef ESC_n #ifndef ESC_n
#define ESC_n '\n' #define ESC_n CHAR_NL
#endif #endif
#ifndef ESC_r #ifndef ESC_r
#define ESC_r '\r' #define ESC_r CHAR_CR
#endif #endif
/* We can't officially use ESC_t because it is a POSIX reserved identifier /* We can't officially use ESC_t because it is a POSIX reserved identifier
(presumably because of all the others like size_t). */ (presumably because of all the others like size_t). */
#ifndef ESC_tee #ifndef ESC_tee
#define ESC_tee '\t' #define ESC_tee CHAR_HT
#endif #endif
/* Codes for different types of Unicode property */ /* Codes for different types of Unicode property */
@ -632,8 +1223,8 @@ enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s,
OP_EOD must correspond in order to the list of escapes immediately above. OP_EOD must correspond in order to the list of escapes immediately above.
*** NOTE NOTE NOTE *** Whenever this list is updated, the two macro definitions *** NOTE NOTE NOTE *** Whenever this list is updated, the two macro definitions
that follow must also be updated to match. There is also a table called that follow must also be updated to match. There are also tables called
"coptable" in pcre_dfa_exec.c that must be updated. */ "coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */
enum { enum {
OP_END, /* 0 End of pattern */ OP_END, /* 0 End of pattern */
@ -769,30 +1360,45 @@ enum {
OP_SCBRA, /* 98 Start of capturing bracket, check empty */ OP_SCBRA, /* 98 Start of capturing bracket, check empty */
OP_SCOND, /* 99 Conditional group, check empty */ OP_SCOND, /* 99 Conditional group, check empty */
OP_CREF, /* 100 Used to hold a capture number as condition */ /* The next two pairs must (respectively) be kept together. */
OP_RREF, /* 101 Used to hold a recursion number as condition */
OP_DEF, /* 102 The DEFINE condition */
OP_BRAZERO, /* 103 These two must remain together and in this */ OP_CREF, /* 100 Used to hold a capture number as condition */
OP_BRAMINZERO, /* 104 order. */ OP_NCREF, /* 101 Same, but generaged by a name reference*/
OP_RREF, /* 102 Used to hold a recursion number as condition */
OP_NRREF, /* 103 Same, but generaged by a name reference*/
OP_DEF, /* 104 The DEFINE condition */
OP_BRAZERO, /* 105 These two must remain together and in this */
OP_BRAMINZERO, /* 106 order. */
/* These are backtracking control verbs */ /* These are backtracking control verbs */
OP_PRUNE, /* 105 */ OP_PRUNE, /* 107 */
OP_SKIP, /* 106 */ OP_SKIP, /* 108 */
OP_THEN, /* 107 */ OP_THEN, /* 109 */
OP_COMMIT, /* 108 */ OP_COMMIT, /* 110 */
/* These are forced failure and success verbs */ /* These are forced failure and success verbs */
OP_FAIL, /* 109 */ OP_FAIL, /* 111 */
OP_ACCEPT, /* 110 */ OP_ACCEPT, /* 112 */
OP_CLOSE, /* 113 Used before OP_ACCEPT to close open captures */
/* This is used to skip a subpattern with a {0} quantifier */ /* This is used to skip a subpattern with a {0} quantifier */
OP_SKIPZERO /* 111 */ OP_SKIPZERO, /* 114 */
/* This is not an opcode, but is used to check that tables indexed by opcode
are the correct length, in order to catch updating errors - there have been
some in the past. */
OP_TABLE_LENGTH
}; };
/* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro
definitions that follow must also be updated to match. There are also tables
called "coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */
/* This macro defines textual names for all the opcodes. These are used only /* This macro defines textual names for all the opcodes. These are used only
for debugging. The macro is referenced only in pcre_printint.c. */ for debugging. The macro is referenced only in pcre_printint.c. */
@ -814,9 +1420,10 @@ for debugging. The macro is referenced only in pcre_printint.c. */
"Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \ "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \
"AssertB", "AssertB not", "Reverse", \ "AssertB", "AssertB not", "Reverse", \
"Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", \ "Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", \
"Cond ref", "Cond rec", "Cond def", "Brazero", "Braminzero", \ "Cond ref", "Cond nref", "Cond rec", "Cond nrec", "Cond def", \
"Brazero", "Braminzero", \
"*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \ "*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \
"Skip zero" "Close", "Skip zero"
/* This macro defines the length of fixed length operations in the compiled /* This macro defines the length of fixed length operations in the compiled
@ -833,8 +1440,9 @@ in UTF-8 mode. The code that uses this table must know about such things. */
1, 1, 1, 1, 1, /* \A, \G, \K, \B, \b */ \ 1, 1, 1, 1, 1, /* \A, \G, \K, \B, \b */ \
1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ \ 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ \
1, 1, 1, /* Any, AllAny, Anybyte */ \ 1, 1, 1, /* Any, AllAny, Anybyte */ \
3, 3, 1, /* NOTPROP, PROP, EXTUNI */ \ 3, 3, /* \P, \p */ \
1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ \ 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ \
1, /* \X */ \
1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \ 1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \
2, /* Char - the minimum length */ \ 2, /* Char - the minimum length */ \
2, /* Charnc - the minimum length */ \ 2, /* Charnc - the minimum length */ \
@ -876,20 +1484,22 @@ in UTF-8 mode. The code that uses this table must know about such things. */
1+LINK_SIZE, /* SBRA */ \ 1+LINK_SIZE, /* SBRA */ \
3+LINK_SIZE, /* SCBRA */ \ 3+LINK_SIZE, /* SCBRA */ \
1+LINK_SIZE, /* SCOND */ \ 1+LINK_SIZE, /* SCOND */ \
3, /* CREF */ \ 3, 3, /* CREF, NCREF */ \
3, /* RREF */ \ 3, 3, /* RREF, NRREF */ \
1, /* DEF */ \ 1, /* DEF */ \
1, 1, /* BRAZERO, BRAMINZERO */ \ 1, 1, /* BRAZERO, BRAMINZERO */ \
1, 1, 1, 1, /* PRUNE, SKIP, THEN, COMMIT, */ \ 1, 1, 1, 1, /* PRUNE, SKIP, THEN, COMMIT, */ \
1, 1, 1 /* FAIL, ACCEPT, SKIPZERO */ 1, 1, 3, 1 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
/* A magic value for OP_RREF to indicate the "any recursion" condition. */ /* A magic value for OP_RREF and OP_NRREF to indicate the "any recursion"
condition. */
#define RREF_ANY 0xffff #define RREF_ANY 0xffff
/* Error code numbers. They are given names so that they can more easily be /* Compile time error code numbers. They are given names so that they can more
tracked. */ easily be tracked. When a new number is added, the table called eint in
pcreposix.c must be updated. */
enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9, enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19, ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19,
@ -897,7 +1507,7 @@ enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39, ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49, ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59, ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
ERR60, ERR61, ERR62, ERR63, ERR64 }; ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERRCOUNT };
/* The real format of the start of the pcre block; the index of names and the /* The real format of the start of the pcre block; the index of names and the
code vector run on as long as necessary after the end. We store an explicit code vector run on as long as necessary after the end. We store an explicit
@ -913,7 +1523,7 @@ Because people can now save and re-use compiled patterns, any additions to this
structure should be made at the end, and something earlier (e.g. a new structure should be made at the end, and something earlier (e.g. a new
flag in the options or one of the dummy fields) should indicate that the new flag in the options or one of the dummy fields) should indicate that the new
fields are present. Currently PCRE always sets the dummy fields to zero. fields are present. Currently PCRE always sets the dummy fields to zero.
NOTE NOTE NOTE: NOTE NOTE NOTE
*/ */
typedef struct real_pcre { typedef struct real_pcre {
@ -940,10 +1550,22 @@ remark (see NOTE above) about extending this structure applies. */
typedef struct pcre_study_data { typedef struct pcre_study_data {
pcre_uint32 size; /* Total that was malloced */ pcre_uint32 size; /* Total that was malloced */
pcre_uint32 options; pcre_uint32 flags; /* Private flags */
uschar start_bits[32]; uschar start_bits[32]; /* Starting char bits */
pcre_uint32 minlength; /* Minimum subject length */
} pcre_study_data; } pcre_study_data;
/* Structure for building a chain of open capturing subpatterns during
compiling, so that instructions to close them can be compiled when (*ACCEPT) is
encountered. This is also used to identify subpatterns that contain recursive
back references to themselves, so that they can be made atomic. Each item is
one link in a singly linked chain: it records one capture group's number and
points on to another item via "next". */
typedef struct open_capitem {
struct open_capitem *next; /* Chain link: the next item in the chain */
pcre_uint16 number; /* Capture number of the open group */
pcre_uint16 flag; /* Set TRUE if recursive back ref; a 16-bit field used as a boolean */
} open_capitem;
/* Structure for passing "static" information around between the functions /* Structure for passing "static" information around between the functions
doing the compiling, so that they are thread-safe. */ doing the compiling, so that they are thread-safe. */
@ -956,6 +1578,7 @@ typedef struct compile_data {
const uschar *start_code; /* The start of the compiled code */ const uschar *start_code; /* The start of the compiled code */
const uschar *start_pattern; /* The start of the pattern */ const uschar *start_pattern; /* The start of the pattern */
const uschar *end_pattern; /* The end of the pattern */ const uschar *end_pattern; /* The end of the pattern */
open_capitem *open_caps; /* Chain of open capture items */
uschar *hwm; /* High watermark of workspace */ uschar *hwm; /* High watermark of workspace */
uschar *name_table; /* The name/number table */ uschar *name_table; /* The name/number table */
int names_found; /* Number of entries so far */ int names_found; /* Number of entries so far */
@ -968,6 +1591,7 @@ typedef struct compile_data {
int external_flags; /* External flag bits to be set */ int external_flags; /* External flag bits to be set */
int req_varyopt; /* "After variable item" flag for reqbyte */ int req_varyopt; /* "After variable item" flag for reqbyte */
BOOL had_accept; /* (*ACCEPT) encountered */ BOOL had_accept; /* (*ACCEPT) encountered */
BOOL check_lookbehind; /* Lookbehinds need later checking */
int nltype; /* Newline type */ int nltype; /* Newline type */
int nllen; /* Newline string length */ int nllen; /* Newline string length */
uschar nl[4]; /* Newline string when fixed length */ uschar nl[4]; /* Newline string when fixed length */
@ -978,7 +1602,7 @@ branches, for testing for left recursion. */
typedef struct branch_chain { typedef struct branch_chain {
struct branch_chain *outer; struct branch_chain *outer;
uschar *current; uschar *current_branch;
} branch_chain; } branch_chain;
/* Structure for items in a linked list that represents an explicit recursive /* Structure for items in a linked list that represents an explicit recursive
@ -988,9 +1612,9 @@ typedef struct recursion_info {
struct recursion_info *prevrec; /* Previous recursion record (or NULL) */ struct recursion_info *prevrec; /* Previous recursion record (or NULL) */
int group_num; /* Number of group that was called */ int group_num; /* Number of group that was called */
const uschar *after_call; /* "Return value": points after the call in the expr */ const uschar *after_call; /* "Return value": points after the call in the expr */
USPTR save_start; /* Old value of mstart */
int *offset_save; /* Pointer to start of saved offsets */ int *offset_save; /* Pointer to start of saved offsets */
int saved_max; /* Number of saved offsets */ int saved_max; /* Number of saved offsets */
int save_offset_top; /* Current value of offset_top */
} recursion_info; } recursion_info;
/* Structure for building a chain of data for holding the values of the subject /* Structure for building a chain of data for holding the values of the subject
@ -1015,6 +1639,9 @@ typedef struct match_data {
int offset_max; /* The maximum usable for return data */ int offset_max; /* The maximum usable for return data */
int nltype; /* Newline type */ int nltype; /* Newline type */
int nllen; /* Newline string length */ int nllen; /* Newline string length */
int name_count; /* Number of names in name table */
int name_entry_size; /* Size of entry in names table */
uschar *name_table; /* Table of names */
uschar nl[4]; /* Newline string when fixed */ uschar nl[4]; /* Newline string when fixed */
const uschar *lcc; /* Points to lower casing table */ const uschar *lcc; /* Points to lower casing table */
const uschar *ctypes; /* Points to table of type maps */ const uschar *ctypes; /* Points to table of type maps */
@ -1025,7 +1652,7 @@ typedef struct match_data {
BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */ BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */
BOOL endonly; /* Dollar not before final \n */ BOOL endonly; /* Dollar not before final \n */
BOOL notempty; /* Empty string match not wanted */ BOOL notempty; /* Empty string match not wanted */
BOOL partial; /* PARTIAL flag */ BOOL notempty_atstart; /* Empty string match at start not wanted */
BOOL hitend; /* Hit the end of the subject at some point */ BOOL hitend; /* Hit the end of the subject at some point */
BOOL bsr_anycrlf; /* \R is just any CRLF, not full Unicode */ BOOL bsr_anycrlf; /* \R is just any CRLF, not full Unicode */
const uschar *start_code; /* For use when recursing */ const uschar *start_code; /* For use when recursing */
@ -1033,6 +1660,8 @@ typedef struct match_data {
USPTR end_subject; /* End of the subject string */ USPTR end_subject; /* End of the subject string */
USPTR start_match_ptr; /* Start of matched string */ USPTR start_match_ptr; /* Start of matched string */
USPTR end_match_ptr; /* Subject position at end match */ USPTR end_match_ptr; /* Subject position at end match */
USPTR start_used_ptr; /* Earliest consulted character */
int partial; /* PARTIAL options */
int end_offset_top; /* Highwater mark at end of match */ int end_offset_top; /* Highwater mark at end of match */
int capture_last; /* Most recent capture number */ int capture_last; /* Most recent capture number */
int start_offset; /* The start offset value */ int start_offset; /* The start offset value */
@ -1049,7 +1678,9 @@ typedef struct dfa_match_data {
const uschar *start_code; /* Start of the compiled pattern */ const uschar *start_code; /* Start of the compiled pattern */
const uschar *start_subject; /* Start of the subject string */ const uschar *start_subject; /* Start of the subject string */
const uschar *end_subject; /* End of subject string */ const uschar *end_subject; /* End of subject string */
const uschar *start_used_ptr; /* Earliest consulted character */
const uschar *tables; /* Character tables */ const uschar *tables; /* Character tables */
int start_offset; /* The start offset value */
int moptions; /* Match options */ int moptions; /* Match options */
int poptions; /* Pattern options */ int poptions; /* Pattern options */
int nltype; /* Newline type */ int nltype; /* Newline type */
@ -1128,20 +1759,30 @@ extern const uschar _pcre_OP_lengths[];
one of the exported public functions. They have to be "external" in the C one of the exported public functions. They have to be "external" in the C
sense, but are not part of the PCRE public API. */ sense, but are not part of the PCRE public API. */
extern BOOL _pcre_is_newline(const uschar *, int, const uschar *, extern const uschar *_pcre_find_bracket(const uschar *, BOOL, int);
int *, BOOL); extern BOOL _pcre_is_newline(USPTR, int, USPTR, int *, BOOL);
extern int _pcre_ord2utf8(int, uschar *); extern int _pcre_ord2utf8(int, uschar *);
extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *, extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *,
const pcre_study_data *, pcre_study_data *); const pcre_study_data *, pcre_study_data *);
extern int _pcre_valid_utf8(const uschar *, int); extern int _pcre_valid_utf8(USPTR, int);
extern BOOL _pcre_was_newline(const uschar *, int, const uschar *, extern BOOL _pcre_was_newline(USPTR, int, USPTR, int *, BOOL);
int *, BOOL); extern BOOL _pcre_xclass(int, const uschar *);
extern BOOL _pcre_xclass(int, const uschar *);
extern unsigned int _pcre_ucp_othercase(unsigned int);
/* Unicode character database (UCD) */
/* One record of per-character property data. Records live in the
_pcre_ucd_records[] table declared below. NOTE(review): the field meanings
given here are inferred from the field names only - confirm against the table
generator and the UCD access macros. */
typedef struct {
uschar script; /* presumably the character's script code - confirm */
uschar chartype; /* presumably the character's type code - confirm */
pcre_int32 other_case; /* signed 32-bit other-case data; whether it is an offset or a code point is not visible here - confirm */
} ucd_record;
extern const ucd_record _pcre_ucd_records[];
extern const uschar _pcre_ucd_stage1[];
extern const pcre_uint16 _pcre_ucd_stage2[];
extern const int _pcre_ucp_gentype[]; extern const int _pcre_ucp_gentype[];
extern unsigned int _pcre_ucp_othercase (unsigned int);
/* UCD access macros */ /* UCD access macros */

View File

@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language. and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Copyright (c) 1997-2008 University of Cambridge Copyright (c) 1997-2009 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -73,8 +73,7 @@ Returns: TRUE or FALSE
*/ */
BOOL BOOL
_pcre_is_newline(const uschar *ptr, int type, const uschar *endptr, _pcre_is_newline(USPTR ptr, int type, USPTR endptr, int *lenptr, BOOL utf8)
int *lenptr, BOOL utf8)
{ {
int c; int c;
if (utf8) { GETCHAR(c, ptr); } else c = *ptr; if (utf8) { GETCHAR(c, ptr); } else c = *ptr;
@ -123,8 +122,7 @@ Returns: TRUE or FALSE
*/ */
BOOL BOOL
_pcre_was_newline(const uschar *ptr, int type, const uschar *startptr, _pcre_was_newline(USPTR ptr, int type, USPTR startptr, int *lenptr, BOOL utf8)
int *lenptr, BOOL utf8)
{ {
int c; int c;
ptr--; ptr--;

View File

@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language. and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Copyright (c) 1997-2008 University of Cambridge Copyright (c) 1997-2010 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -54,6 +54,379 @@ supporting functions. */
enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE }; enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
/*************************************************
*   Find the minimum subject length for a group  *
*************************************************/

/* Scan a parenthesized group and compute the minimum length of subject that
is needed to match it. This is a lower bound; it does not mean there is a
string of that length that matches. In UTF8 mode, the result is in characters
rather than bytes.

The function calls itself for nested groups, for the groups referred to by
back references, and for subroutine calls. A negative (error) result from any
of these nested calls is passed straight back to the caller; it is never added
into a length.

Arguments:
  code        pointer to start of group (the bracket)
  startcode   pointer to start of the whole pattern
  options     the compiling options

Returns:   the minimum length
           -1 if \C was encountered
           -2 internal error (missing capturing bracket)
*/

static int
find_minlength(const uschar *code, const uschar *startcode, int options)
{
int length = -1;
BOOL utf8 = (options & PCRE_UTF8) != 0;
BOOL had_recurse = FALSE;
register int branchlength = 0;
register uschar *cc = (uschar *)code + 1 + LINK_SIZE;

/* A capturing bracket has a two-byte group number after the link. */

if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;

/* Scan along the opcodes for this branch. If we get to the end of the
branch, check the length against that of the other branches. */

for (;;)
  {
  int d, min;
  uschar *cs, *ce;
  register int op = *cc;

  switch (op)
    {
    case OP_COND:
    case OP_SCOND:

    /* If there is only one branch in a condition, the implied branch has zero
    length, so we don't add anything. This covers the DEFINE "condition"
    automatically. */

    cs = cc + GET(cc, 1);
    if (*cs != OP_ALT)
      {
      cc = cs + 1 + LINK_SIZE;
      break;
      }

    /* Otherwise we can fall through and treat it the same as any other
    subpattern. */

    case OP_CBRA:
    case OP_SCBRA:
    case OP_BRA:
    case OP_SBRA:
    case OP_ONCE:
    d = find_minlength(cc, startcode, options);
    if (d < 0) return d;
    branchlength += d;
    do cc += GET(cc, 1); while (*cc == OP_ALT);
    cc += 1 + LINK_SIZE;
    break;

    /* Reached end of a branch; if it's a ket it is the end of a nested
    call. If it's ALT it is an alternation in a nested call. If it is
    END it's the end of the outer call. All can be handled by the same code. */

    case OP_ALT:
    case OP_KET:
    case OP_KETRMAX:
    case OP_KETRMIN:
    case OP_END:
    if (length < 0 || (!had_recurse && branchlength < length))
      length = branchlength;
    if (*cc != OP_ALT) return length;
    cc += 1 + LINK_SIZE;
    branchlength = 0;
    had_recurse = FALSE;
    break;

    /* Skip over assertive subpatterns */

    case OP_ASSERT:
    case OP_ASSERT_NOT:
    case OP_ASSERTBACK:
    case OP_ASSERTBACK_NOT:
    do cc += GET(cc, 1); while (*cc == OP_ALT);
    /* Fall through */

    /* Skip over things that don't match chars */

    case OP_REVERSE:
    case OP_CREF:
    case OP_NCREF:
    case OP_RREF:
    case OP_NRREF:
    case OP_DEF:
    case OP_OPT:
    case OP_CALLOUT:
    case OP_SOD:
    case OP_SOM:
    case OP_EOD:
    case OP_EODN:
    case OP_CIRC:
    case OP_DOLL:
    case OP_NOT_WORD_BOUNDARY:
    case OP_WORD_BOUNDARY:
    cc += _pcre_OP_lengths[*cc];
    break;

    /* Skip over a subpattern that has a {0} or {0,x} quantifier */

    case OP_BRAZERO:
    case OP_BRAMINZERO:
    case OP_SKIPZERO:
    cc += _pcre_OP_lengths[*cc];
    do cc += GET(cc, 1); while (*cc == OP_ALT);
    cc += 1 + LINK_SIZE;
    break;

    /* Handle literal characters and + repetitions */

    case OP_CHAR:
    case OP_CHARNC:
    case OP_NOT:
    case OP_PLUS:
    case OP_MINPLUS:
    case OP_POSPLUS:
    case OP_NOTPLUS:
    case OP_NOTMINPLUS:
    case OP_NOTPOSPLUS:
    branchlength++;
    cc += 2;
#ifdef SUPPORT_UTF8
    if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
#endif
    break;

    case OP_TYPEPLUS:
    case OP_TYPEMINPLUS:
    case OP_TYPEPOSPLUS:
    branchlength++;
    cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
    break;

    /* Handle exact repetitions. The count is already in characters, but we
    need to skip over a multibyte character in UTF8 mode. */

    case OP_EXACT:
    case OP_NOTEXACT:
    branchlength += GET2(cc,1);
    cc += 4;
#ifdef SUPPORT_UTF8
    if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
#endif
    break;

    case OP_TYPEEXACT:
    branchlength += GET2(cc,1);
    cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
    break;

    /* Handle single-char non-literal matchers */

    case OP_PROP:
    case OP_NOTPROP:
    cc += 2;
    /* Fall through */

    case OP_NOT_DIGIT:
    case OP_DIGIT:
    case OP_NOT_WHITESPACE:
    case OP_WHITESPACE:
    case OP_NOT_WORDCHAR:
    case OP_WORDCHAR:
    case OP_ANY:
    case OP_ALLANY:
    case OP_EXTUNI:
    case OP_HSPACE:
    case OP_NOT_HSPACE:
    case OP_VSPACE:
    case OP_NOT_VSPACE:
    branchlength++;
    cc++;
    break;

    /* "Any newline" might match two characters (CRLF), but it can also match
    just one. As we are computing a lower bound, only one may be counted;
    counting two would make the bound too high for a subject in which the
    newline is a single character. */

    case OP_ANYNL:
    branchlength += 1;
    cc++;
    break;

    /* The single-byte matcher means we can't proceed in UTF-8 mode */

    case OP_ANYBYTE:
#ifdef SUPPORT_UTF8
    if (utf8) return -1;
#endif
    branchlength++;
    cc++;
    break;

    /* For repeated character types, we have to test for \p and \P, which have
    an extra two bytes of parameters. */

    case OP_TYPESTAR:
    case OP_TYPEMINSTAR:
    case OP_TYPEQUERY:
    case OP_TYPEMINQUERY:
    case OP_TYPEPOSSTAR:
    case OP_TYPEPOSQUERY:
    if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
    cc += _pcre_OP_lengths[op];
    break;

    case OP_TYPEUPTO:
    case OP_TYPEMINUPTO:
    case OP_TYPEPOSUPTO:
    if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
    cc += _pcre_OP_lengths[op];
    break;

    /* Check a class for variable quantification */

#ifdef SUPPORT_UTF8
    case OP_XCLASS:
    cc += GET(cc, 1) - 33;
    /* Fall through */
#endif

    case OP_CLASS:
    case OP_NCLASS:
    cc += 33;

    switch (*cc)
      {
      case OP_CRPLUS:
      case OP_CRMINPLUS:
      branchlength++;
      /* Fall through */

      case OP_CRSTAR:
      case OP_CRMINSTAR:
      case OP_CRQUERY:
      case OP_CRMINQUERY:
      cc++;
      break;

      case OP_CRRANGE:
      case OP_CRMINRANGE:
      branchlength += GET2(cc,1);
      cc += 5;
      break;

      default:
      branchlength++;
      break;
      }
    break;

    /* Backreferences and subroutine calls are treated in the same way: we find
    the minimum length for the subpattern. A recursion, however, causes a
    flag to be set that causes the length of this branch to be ignored. The
    logic is that a recursion can only make sense if there is another
    alternation that stops the recursing. That will provide the minimum length
    (when no recursion happens). A backreference within the group that it is
    referencing behaves in the same way.

    If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
    matches an empty string (by default it causes a matching failure), so in
    that case we must set the minimum length to zero. */

    case OP_REF:
    if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
      {
      ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
      if (cs == NULL) return -2;
      do ce += GET(ce, 1); while (*ce == OP_ALT);
      if (cc > cs && cc < ce)
        {
        d = 0;
        had_recurse = TRUE;
        }
      else
        {
        /* An error from the referenced group must be returned as it is, not
        multiplied by the repeat count and added to the length. */

        d = find_minlength(cs, startcode, options);
        if (d < 0) return d;
        }
      }
    else d = 0;
    cc += 3;

    /* Handle repeated back references */

    switch (*cc)
      {
      case OP_CRSTAR:
      case OP_CRMINSTAR:
      case OP_CRQUERY:
      case OP_CRMINQUERY:
      min = 0;
      cc++;
      break;

      case OP_CRRANGE:
      case OP_CRMINRANGE:
      min = GET2(cc, 1);
      cc += 5;
      break;

      default:
      min = 1;
      break;
      }

    branchlength += min * d;
    break;

    case OP_RECURSE:
    cs = ce = (uschar *)startcode + GET(cc, 1);
    if (cs == NULL) return -2;
    do ce += GET(ce, 1); while (*ce == OP_ALT);
    if (cc > cs && cc < ce)
      had_recurse = TRUE;
    else
      {
      /* As for a back reference, an error from the called group is returned
      unchanged instead of being added to the length. */

      d = find_minlength(cs, startcode, options);
      if (d < 0) return d;
      branchlength += d;
      }
    cc += 1 + LINK_SIZE;
    break;

    /* Anything else does not or need not match a character. We can get the
    item's length from the table, but for those that can match zero occurrences
    of a character, we must take special action for UTF-8 characters. */

    case OP_UPTO:
    case OP_NOTUPTO:
    case OP_MINUPTO:
    case OP_NOTMINUPTO:
    case OP_POSUPTO:
    case OP_STAR:
    case OP_MINSTAR:
    case OP_NOTMINSTAR:
    case OP_POSSTAR:
    case OP_NOTPOSSTAR:
    case OP_QUERY:
    case OP_MINQUERY:
    case OP_NOTMINQUERY:
    case OP_POSQUERY:
    case OP_NOTPOSQUERY:
    cc += _pcre_OP_lengths[op];
#ifdef SUPPORT_UTF8
    if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
#endif
    break;

    /* For the record, these are the opcodes that are matched by "default":
    OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
    OP_THEN. */

    default:
    cc += _pcre_OP_lengths[op];
    break;
    }
  }
/* Control never gets here */
}
/************************************************* /*************************************************
* Set a bit and maybe its alternate case * * Set a bit and maybe its alternate case *
*************************************************/ *************************************************/
@ -71,7 +444,8 @@ Returns: nothing
*/ */
static void static void
set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd) set_table_bit(uschar *start_bits, unsigned int c, BOOL caseless,
compile_data *cd)
{ {
start_bits[c/8] |= (1 << (c&7)); start_bits[c/8] |= (1 << (c&7));
if (caseless && (cd->ctypes[c] & ctype_letter) != 0) if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
@ -233,7 +607,7 @@ do
case OP_QUERY: case OP_QUERY:
case OP_MINQUERY: case OP_MINQUERY:
case OP_POSQUERY: case OP_POSQUERY:
set_bit(start_bits, tcode[1], caseless, cd); set_table_bit(start_bits, tcode[1], caseless, cd);
tcode += 2; tcode += 2;
#ifdef SUPPORT_UTF8 #ifdef SUPPORT_UTF8
if (utf8 && tcode[-1] >= 0xc0) if (utf8 && tcode[-1] >= 0xc0)
@ -246,7 +620,7 @@ do
case OP_UPTO: case OP_UPTO:
case OP_MINUPTO: case OP_MINUPTO:
case OP_POSUPTO: case OP_POSUPTO:
set_bit(start_bits, tcode[3], caseless, cd); set_table_bit(start_bits, tcode[3], caseless, cd);
tcode += 4; tcode += 4;
#ifdef SUPPORT_UTF8 #ifdef SUPPORT_UTF8
if (utf8 && tcode[-1] >= 0xc0) if (utf8 && tcode[-1] >= 0xc0)
@ -264,7 +638,7 @@ do
case OP_PLUS: case OP_PLUS:
case OP_MINPLUS: case OP_MINPLUS:
case OP_POSPLUS: case OP_POSPLUS:
set_bit(start_bits, tcode[1], caseless, cd); set_table_bit(start_bits, tcode[1], caseless, cd);
try_next = FALSE; try_next = FALSE;
break; break;
@ -500,13 +874,15 @@ Arguments:
set NULL unless error set NULL unless error
Returns: pointer to a pcre_extra block, with study_data filled in and the Returns: pointer to a pcre_extra block, with study_data filled in and the
appropriate flag set; appropriate flags set;
NULL on error or if no optimization possible NULL on error or if no optimization possible
*/ */
PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
pcre_study(const pcre *external_re, int options, const char **errorptr) pcre_study(const pcre *external_re, int options, const char **errorptr)
{ {
int min;
BOOL bits_set = FALSE;
uschar start_bits[32]; uschar start_bits[32];
pcre_extra *extra; pcre_extra *extra;
pcre_study_data *study; pcre_study_data *study;
@ -533,30 +909,39 @@ code = (uschar *)re + re->name_table_offset +
(re->name_count * re->name_entry_size); (re->name_count * re->name_entry_size);
/* For an anchored pattern, or an unanchored pattern that has a first char, or /* For an anchored pattern, or an unanchored pattern that has a first char, or
a multiline pattern that matches only at "line starts", no further processing a multiline pattern that matches only at "line starts", there is no point in
at present. */ seeking a list of starting bytes. */
if ((re->options & PCRE_ANCHORED) != 0 || if ((re->options & PCRE_ANCHORED) == 0 &&
(re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0) (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
return NULL; {
/* Set the character tables in the block that is passed around */
/* Set the character tables in the block that is passed around */ tables = re->tables;
if (tables == NULL)
(void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
(void *)(&tables));
tables = re->tables; compile_block.lcc = tables + lcc_offset;
if (tables == NULL) compile_block.fcc = tables + fcc_offset;
(void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES, compile_block.cbits = tables + cbits_offset;
(void *)(&tables)); compile_block.ctypes = tables + ctypes_offset;
compile_block.lcc = tables + lcc_offset; /* See if we can find a fixed set of initial characters for the pattern. */
compile_block.fcc = tables + fcc_offset;
compile_block.cbits = tables + cbits_offset;
compile_block.ctypes = tables + ctypes_offset;
/* See if we can find a fixed set of initial characters for the pattern. */ memset(start_bits, 0, 32 * sizeof(uschar));
bits_set = set_start_bits(code, start_bits,
(re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
&compile_block) == SSB_DONE;
}
memset(start_bits, 0, 32 * sizeof(uschar)); /* Find the minimum length of subject string. */
if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
(re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL; min = find_minlength(code, code, re->options);
/* Return NULL if no optimization is possible. */
if (!bits_set && min < 0) return NULL;
/* Get a pcre_extra block and a pcre_study_data block. The study data is put in /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
the latter, which is pointed to by the former, which may also get additional the latter, which is pointed to by the former, which may also get additional
@ -579,8 +964,19 @@ extra->flags = PCRE_EXTRA_STUDY_DATA;
extra->study_data = study; extra->study_data = study;
study->size = sizeof(pcre_study_data); study->size = sizeof(pcre_study_data);
study->options = PCRE_STUDY_MAPPED; study->flags = 0;
memcpy(study->start_bits, start_bits, sizeof(start_bits));
if (bits_set)
{
study->flags |= PCRE_STUDY_MAPPED;
memcpy(study->start_bits, start_bits, sizeof(start_bits));
}
if (min >= 0)
{
study->flags |= PCRE_STUDY_MINLEN;
study->minlength = min;
}
return extra; return extra;
} }

View File

@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language. and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Copyright (c) 1997-2008 University of Cambridge Copyright (c) 1997-2009 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -109,244 +109,411 @@ putting all the names into a single, large string and then using offsets in the
table itself. Maintenance is more error-prone, but frequent changes to this table itself. Maintenance is more error-prone, but frequent changes to this
data are unlikely. data are unlikely.
July 2008: There is now a script called maint/GenerateUtt.py which can be used July 2008: There is now a script called maint/GenerateUtt.py that can be used
to generate this data instead of maintaining it entirely by hand. */ to generate this data instead of maintaining it entirely by hand.
The script was updated in March 2009 to generate a new EBCDIC-compliant
version. Like all other character and string literals that are compared against
the regular expression pattern, we must use STR_ macros instead of literal
strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Any0 STR_A STR_n STR_y "\0"
#define STRING_Arabic0 STR_A STR_r STR_a STR_b STR_i STR_c "\0"
#define STRING_Armenian0 STR_A STR_r STR_m STR_e STR_n STR_i STR_a STR_n "\0"
#define STRING_Avestan0 STR_A STR_v STR_e STR_s STR_t STR_a STR_n "\0"
#define STRING_Balinese0 STR_B STR_a STR_l STR_i STR_n STR_e STR_s STR_e "\0"
#define STRING_Bamum0 STR_B STR_a STR_m STR_u STR_m "\0"
#define STRING_Bengali0 STR_B STR_e STR_n STR_g STR_a STR_l STR_i "\0"
#define STRING_Bopomofo0 STR_B STR_o STR_p STR_o STR_m STR_o STR_f STR_o "\0"
#define STRING_Braille0 STR_B STR_r STR_a STR_i STR_l STR_l STR_e "\0"
#define STRING_Buginese0 STR_B STR_u STR_g STR_i STR_n STR_e STR_s STR_e "\0"
#define STRING_Buhid0 STR_B STR_u STR_h STR_i STR_d "\0"
#define STRING_C0 STR_C "\0"
#define STRING_Canadian_Aboriginal0 STR_C STR_a STR_n STR_a STR_d STR_i STR_a STR_n STR_UNDERSCORE STR_A STR_b STR_o STR_r STR_i STR_g STR_i STR_n STR_a STR_l "\0"
#define STRING_Carian0 STR_C STR_a STR_r STR_i STR_a STR_n "\0"
#define STRING_Cc0 STR_C STR_c "\0"
#define STRING_Cf0 STR_C STR_f "\0"
#define STRING_Cham0 STR_C STR_h STR_a STR_m "\0"
#define STRING_Cherokee0 STR_C STR_h STR_e STR_r STR_o STR_k STR_e STR_e "\0"
#define STRING_Cn0 STR_C STR_n "\0"
#define STRING_Co0 STR_C STR_o "\0"
#define STRING_Common0 STR_C STR_o STR_m STR_m STR_o STR_n "\0"
#define STRING_Coptic0 STR_C STR_o STR_p STR_t STR_i STR_c "\0"
#define STRING_Cs0 STR_C STR_s "\0"
#define STRING_Cuneiform0 STR_C STR_u STR_n STR_e STR_i STR_f STR_o STR_r STR_m "\0"
#define STRING_Cypriot0 STR_C STR_y STR_p STR_r STR_i STR_o STR_t "\0"
#define STRING_Cyrillic0 STR_C STR_y STR_r STR_i STR_l STR_l STR_i STR_c "\0"
#define STRING_Deseret0 STR_D STR_e STR_s STR_e STR_r STR_e STR_t "\0"
#define STRING_Devanagari0 STR_D STR_e STR_v STR_a STR_n STR_a STR_g STR_a STR_r STR_i "\0"
#define STRING_Egyptian_Hieroglyphs0 STR_E STR_g STR_y STR_p STR_t STR_i STR_a STR_n STR_UNDERSCORE STR_H STR_i STR_e STR_r STR_o STR_g STR_l STR_y STR_p STR_h STR_s "\0"
#define STRING_Ethiopic0 STR_E STR_t STR_h STR_i STR_o STR_p STR_i STR_c "\0"
#define STRING_Georgian0 STR_G STR_e STR_o STR_r STR_g STR_i STR_a STR_n "\0"
#define STRING_Glagolitic0 STR_G STR_l STR_a STR_g STR_o STR_l STR_i STR_t STR_i STR_c "\0"
#define STRING_Gothic0 STR_G STR_o STR_t STR_h STR_i STR_c "\0"
#define STRING_Greek0 STR_G STR_r STR_e STR_e STR_k "\0"
#define STRING_Gujarati0 STR_G STR_u STR_j STR_a STR_r STR_a STR_t STR_i "\0"
#define STRING_Gurmukhi0 STR_G STR_u STR_r STR_m STR_u STR_k STR_h STR_i "\0"
#define STRING_Han0 STR_H STR_a STR_n "\0"
#define STRING_Hangul0 STR_H STR_a STR_n STR_g STR_u STR_l "\0"
#define STRING_Hanunoo0 STR_H STR_a STR_n STR_u STR_n STR_o STR_o "\0"
#define STRING_Hebrew0 STR_H STR_e STR_b STR_r STR_e STR_w "\0"
#define STRING_Hiragana0 STR_H STR_i STR_r STR_a STR_g STR_a STR_n STR_a "\0"
#define STRING_Imperial_Aramaic0 STR_I STR_m STR_p STR_e STR_r STR_i STR_a STR_l STR_UNDERSCORE STR_A STR_r STR_a STR_m STR_a STR_i STR_c "\0"
#define STRING_Inherited0 STR_I STR_n STR_h STR_e STR_r STR_i STR_t STR_e STR_d "\0"
#define STRING_Inscriptional_Pahlavi0 STR_I STR_n STR_s STR_c STR_r STR_i STR_p STR_t STR_i STR_o STR_n STR_a STR_l STR_UNDERSCORE STR_P STR_a STR_h STR_l STR_a STR_v STR_i "\0"
#define STRING_Inscriptional_Parthian0 STR_I STR_n STR_s STR_c STR_r STR_i STR_p STR_t STR_i STR_o STR_n STR_a STR_l STR_UNDERSCORE STR_P STR_a STR_r STR_t STR_h STR_i STR_a STR_n "\0"
#define STRING_Javanese0 STR_J STR_a STR_v STR_a STR_n STR_e STR_s STR_e "\0"
#define STRING_Kaithi0 STR_K STR_a STR_i STR_t STR_h STR_i "\0"
#define STRING_Kannada0 STR_K STR_a STR_n STR_n STR_a STR_d STR_a "\0"
#define STRING_Katakana0 STR_K STR_a STR_t STR_a STR_k STR_a STR_n STR_a "\0"
#define STRING_Kayah_Li0 STR_K STR_a STR_y STR_a STR_h STR_UNDERSCORE STR_L STR_i "\0"
#define STRING_Kharoshthi0 STR_K STR_h STR_a STR_r STR_o STR_s STR_h STR_t STR_h STR_i "\0"
#define STRING_Khmer0 STR_K STR_h STR_m STR_e STR_r "\0"
#define STRING_L0 STR_L "\0"
#define STRING_L_AMPERSAND0 STR_L STR_AMPERSAND "\0"
#define STRING_Lao0 STR_L STR_a STR_o "\0"
#define STRING_Latin0 STR_L STR_a STR_t STR_i STR_n "\0"
#define STRING_Lepcha0 STR_L STR_e STR_p STR_c STR_h STR_a "\0"
#define STRING_Limbu0 STR_L STR_i STR_m STR_b STR_u "\0"
#define STRING_Linear_B0 STR_L STR_i STR_n STR_e STR_a STR_r STR_UNDERSCORE STR_B "\0"
#define STRING_Lisu0 STR_L STR_i STR_s STR_u "\0"
#define STRING_Ll0 STR_L STR_l "\0"
#define STRING_Lm0 STR_L STR_m "\0"
#define STRING_Lo0 STR_L STR_o "\0"
#define STRING_Lt0 STR_L STR_t "\0"
#define STRING_Lu0 STR_L STR_u "\0"
#define STRING_Lycian0 STR_L STR_y STR_c STR_i STR_a STR_n "\0"
#define STRING_Lydian0 STR_L STR_y STR_d STR_i STR_a STR_n "\0"
#define STRING_M0 STR_M "\0"
#define STRING_Malayalam0 STR_M STR_a STR_l STR_a STR_y STR_a STR_l STR_a STR_m "\0"
#define STRING_Mc0 STR_M STR_c "\0"
#define STRING_Me0 STR_M STR_e "\0"
#define STRING_Meetei_Mayek0 STR_M STR_e STR_e STR_t STR_e STR_i STR_UNDERSCORE STR_M STR_a STR_y STR_e STR_k "\0"
#define STRING_Mn0 STR_M STR_n "\0"
#define STRING_Mongolian0 STR_M STR_o STR_n STR_g STR_o STR_l STR_i STR_a STR_n "\0"
#define STRING_Myanmar0 STR_M STR_y STR_a STR_n STR_m STR_a STR_r "\0"
#define STRING_N0 STR_N "\0"
#define STRING_Nd0 STR_N STR_d "\0"
#define STRING_New_Tai_Lue0 STR_N STR_e STR_w STR_UNDERSCORE STR_T STR_a STR_i STR_UNDERSCORE STR_L STR_u STR_e "\0"
#define STRING_Nko0 STR_N STR_k STR_o "\0"
#define STRING_Nl0 STR_N STR_l "\0"
#define STRING_No0 STR_N STR_o "\0"
#define STRING_Ogham0 STR_O STR_g STR_h STR_a STR_m "\0"
#define STRING_Ol_Chiki0 STR_O STR_l STR_UNDERSCORE STR_C STR_h STR_i STR_k STR_i "\0"
#define STRING_Old_Italic0 STR_O STR_l STR_d STR_UNDERSCORE STR_I STR_t STR_a STR_l STR_i STR_c "\0"
#define STRING_Old_Persian0 STR_O STR_l STR_d STR_UNDERSCORE STR_P STR_e STR_r STR_s STR_i STR_a STR_n "\0"
#define STRING_Old_South_Arabian0 STR_O STR_l STR_d STR_UNDERSCORE STR_S STR_o STR_u STR_t STR_h STR_UNDERSCORE STR_A STR_r STR_a STR_b STR_i STR_a STR_n "\0"
#define STRING_Old_Turkic0 STR_O STR_l STR_d STR_UNDERSCORE STR_T STR_u STR_r STR_k STR_i STR_c "\0"
#define STRING_Oriya0 STR_O STR_r STR_i STR_y STR_a "\0"
#define STRING_Osmanya0 STR_O STR_s STR_m STR_a STR_n STR_y STR_a "\0"
#define STRING_P0 STR_P "\0"
#define STRING_Pc0 STR_P STR_c "\0"
#define STRING_Pd0 STR_P STR_d "\0"
#define STRING_Pe0 STR_P STR_e "\0"
#define STRING_Pf0 STR_P STR_f "\0"
#define STRING_Phags_Pa0 STR_P STR_h STR_a STR_g STR_s STR_UNDERSCORE STR_P STR_a "\0"
#define STRING_Phoenician0 STR_P STR_h STR_o STR_e STR_n STR_i STR_c STR_i STR_a STR_n "\0"
#define STRING_Pi0 STR_P STR_i "\0"
#define STRING_Po0 STR_P STR_o "\0"
#define STRING_Ps0 STR_P STR_s "\0"
#define STRING_Rejang0 STR_R STR_e STR_j STR_a STR_n STR_g "\0"
#define STRING_Runic0 STR_R STR_u STR_n STR_i STR_c "\0"
#define STRING_S0 STR_S "\0"
#define STRING_Samaritan0 STR_S STR_a STR_m STR_a STR_r STR_i STR_t STR_a STR_n "\0"
#define STRING_Saurashtra0 STR_S STR_a STR_u STR_r STR_a STR_s STR_h STR_t STR_r STR_a "\0"
#define STRING_Sc0 STR_S STR_c "\0"
#define STRING_Shavian0 STR_S STR_h STR_a STR_v STR_i STR_a STR_n "\0"
#define STRING_Sinhala0 STR_S STR_i STR_n STR_h STR_a STR_l STR_a "\0"
#define STRING_Sk0 STR_S STR_k "\0"
#define STRING_Sm0 STR_S STR_m "\0"
#define STRING_So0 STR_S STR_o "\0"
#define STRING_Sundanese0 STR_S STR_u STR_n STR_d STR_a STR_n STR_e STR_s STR_e "\0"
#define STRING_Syloti_Nagri0 STR_S STR_y STR_l STR_o STR_t STR_i STR_UNDERSCORE STR_N STR_a STR_g STR_r STR_i "\0"
#define STRING_Syriac0 STR_S STR_y STR_r STR_i STR_a STR_c "\0"
#define STRING_Tagalog0 STR_T STR_a STR_g STR_a STR_l STR_o STR_g "\0"
#define STRING_Tagbanwa0 STR_T STR_a STR_g STR_b STR_a STR_n STR_w STR_a "\0"
#define STRING_Tai_Le0 STR_T STR_a STR_i STR_UNDERSCORE STR_L STR_e "\0"
#define STRING_Tai_Tham0 STR_T STR_a STR_i STR_UNDERSCORE STR_T STR_h STR_a STR_m "\0"
#define STRING_Tai_Viet0 STR_T STR_a STR_i STR_UNDERSCORE STR_V STR_i STR_e STR_t "\0"
#define STRING_Tamil0 STR_T STR_a STR_m STR_i STR_l "\0"
#define STRING_Telugu0 STR_T STR_e STR_l STR_u STR_g STR_u "\0"
#define STRING_Thaana0 STR_T STR_h STR_a STR_a STR_n STR_a "\0"
#define STRING_Thai0 STR_T STR_h STR_a STR_i "\0"
#define STRING_Tibetan0 STR_T STR_i STR_b STR_e STR_t STR_a STR_n "\0"
#define STRING_Tifinagh0 STR_T STR_i STR_f STR_i STR_n STR_a STR_g STR_h "\0"
#define STRING_Ugaritic0 STR_U STR_g STR_a STR_r STR_i STR_t STR_i STR_c "\0"
#define STRING_Vai0 STR_V STR_a STR_i "\0"
#define STRING_Yi0 STR_Y STR_i "\0"
#define STRING_Z0 STR_Z "\0"
#define STRING_Zl0 STR_Z STR_l "\0"
#define STRING_Zp0 STR_Z STR_p "\0"
#define STRING_Zs0 STR_Z STR_s "\0"
const char _pcre_utt_names[] = const char _pcre_utt_names[] =
"Any\0" STRING_Any0
"Arabic\0" STRING_Arabic0
"Armenian\0" STRING_Armenian0
"Balinese\0" STRING_Avestan0
"Bengali\0" STRING_Balinese0
"Bopomofo\0" STRING_Bamum0
"Braille\0" STRING_Bengali0
"Buginese\0" STRING_Bopomofo0
"Buhid\0" STRING_Braille0
"C\0" STRING_Buginese0
"Canadian_Aboriginal\0" STRING_Buhid0
"Carian\0" STRING_C0
"Cc\0" STRING_Canadian_Aboriginal0
"Cf\0" STRING_Carian0
"Cham\0" STRING_Cc0
"Cherokee\0" STRING_Cf0
"Cn\0" STRING_Cham0
"Co\0" STRING_Cherokee0
"Common\0" STRING_Cn0
"Coptic\0" STRING_Co0
"Cs\0" STRING_Common0
"Cuneiform\0" STRING_Coptic0
"Cypriot\0" STRING_Cs0
"Cyrillic\0" STRING_Cuneiform0
"Deseret\0" STRING_Cypriot0
"Devanagari\0" STRING_Cyrillic0
"Ethiopic\0" STRING_Deseret0
"Georgian\0" STRING_Devanagari0
"Glagolitic\0" STRING_Egyptian_Hieroglyphs0
"Gothic\0" STRING_Ethiopic0
"Greek\0" STRING_Georgian0
"Gujarati\0" STRING_Glagolitic0
"Gurmukhi\0" STRING_Gothic0
"Han\0" STRING_Greek0
"Hangul\0" STRING_Gujarati0
"Hanunoo\0" STRING_Gurmukhi0
"Hebrew\0" STRING_Han0
"Hiragana\0" STRING_Hangul0
"Inherited\0" STRING_Hanunoo0
"Kannada\0" STRING_Hebrew0
"Katakana\0" STRING_Hiragana0
"Kayah_Li\0" STRING_Imperial_Aramaic0
"Kharoshthi\0" STRING_Inherited0
"Khmer\0" STRING_Inscriptional_Pahlavi0
"L\0" STRING_Inscriptional_Parthian0
"L&\0" STRING_Javanese0
"Lao\0" STRING_Kaithi0
"Latin\0" STRING_Kannada0
"Lepcha\0" STRING_Katakana0
"Limbu\0" STRING_Kayah_Li0
"Linear_B\0" STRING_Kharoshthi0
"Ll\0" STRING_Khmer0
"Lm\0" STRING_L0
"Lo\0" STRING_L_AMPERSAND0
"Lt\0" STRING_Lao0
"Lu\0" STRING_Latin0
"Lycian\0" STRING_Lepcha0
"Lydian\0" STRING_Limbu0
"M\0" STRING_Linear_B0
"Malayalam\0" STRING_Lisu0
"Mc\0" STRING_Ll0
"Me\0" STRING_Lm0
"Mn\0" STRING_Lo0
"Mongolian\0" STRING_Lt0
"Myanmar\0" STRING_Lu0
"N\0" STRING_Lycian0
"Nd\0" STRING_Lydian0
"New_Tai_Lue\0" STRING_M0
"Nko\0" STRING_Malayalam0
"Nl\0" STRING_Mc0
"No\0" STRING_Me0
"Ogham\0" STRING_Meetei_Mayek0
"Ol_Chiki\0" STRING_Mn0
"Old_Italic\0" STRING_Mongolian0
"Old_Persian\0" STRING_Myanmar0
"Oriya\0" STRING_N0
"Osmanya\0" STRING_Nd0
"P\0" STRING_New_Tai_Lue0
"Pc\0" STRING_Nko0
"Pd\0" STRING_Nl0
"Pe\0" STRING_No0
"Pf\0" STRING_Ogham0
"Phags_Pa\0" STRING_Ol_Chiki0
"Phoenician\0" STRING_Old_Italic0
"Pi\0" STRING_Old_Persian0
"Po\0" STRING_Old_South_Arabian0
"Ps\0" STRING_Old_Turkic0
"Rejang\0" STRING_Oriya0
"Runic\0" STRING_Osmanya0
"S\0" STRING_P0
"Saurashtra\0" STRING_Pc0
"Sc\0" STRING_Pd0
"Shavian\0" STRING_Pe0
"Sinhala\0" STRING_Pf0
"Sk\0" STRING_Phags_Pa0
"Sm\0" STRING_Phoenician0
"So\0" STRING_Pi0
"Sundanese\0" STRING_Po0
"Syloti_Nagri\0" STRING_Ps0
"Syriac\0" STRING_Rejang0
"Tagalog\0" STRING_Runic0
"Tagbanwa\0" STRING_S0
"Tai_Le\0" STRING_Samaritan0
"Tamil\0" STRING_Saurashtra0
"Telugu\0" STRING_Sc0
"Thaana\0" STRING_Shavian0
"Thai\0" STRING_Sinhala0
"Tibetan\0" STRING_Sk0
"Tifinagh\0" STRING_Sm0
"Ugaritic\0" STRING_So0
"Vai\0" STRING_Sundanese0
"Yi\0" STRING_Syloti_Nagri0
"Z\0" STRING_Syriac0
"Zl\0" STRING_Tagalog0
"Zp\0" STRING_Tagbanwa0
"Zs\0"; STRING_Tai_Le0
STRING_Tai_Tham0
STRING_Tai_Viet0
STRING_Tamil0
STRING_Telugu0
STRING_Thaana0
STRING_Thai0
STRING_Tibetan0
STRING_Tifinagh0
STRING_Ugaritic0
STRING_Vai0
STRING_Yi0
STRING_Z0
STRING_Zl0
STRING_Zp0
STRING_Zs0;
const ucp_type_table _pcre_utt[] = { const ucp_type_table _pcre_utt[] = {
{ 0, PT_ANY, 0 }, { 0, PT_ANY, 0 },
{ 4, PT_SC, ucp_Arabic }, { 4, PT_SC, ucp_Arabic },
{ 11, PT_SC, ucp_Armenian }, { 11, PT_SC, ucp_Armenian },
{ 20, PT_SC, ucp_Balinese }, { 20, PT_SC, ucp_Avestan },
{ 29, PT_SC, ucp_Bengali }, { 28, PT_SC, ucp_Balinese },
{ 37, PT_SC, ucp_Bopomofo }, { 37, PT_SC, ucp_Bamum },
{ 46, PT_SC, ucp_Braille }, { 43, PT_SC, ucp_Bengali },
{ 54, PT_SC, ucp_Buginese }, { 51, PT_SC, ucp_Bopomofo },
{ 63, PT_SC, ucp_Buhid }, { 60, PT_SC, ucp_Braille },
{ 69, PT_GC, ucp_C }, { 68, PT_SC, ucp_Buginese },
{ 71, PT_SC, ucp_Canadian_Aboriginal }, { 77, PT_SC, ucp_Buhid },
{ 91, PT_SC, ucp_Carian }, { 83, PT_GC, ucp_C },
{ 98, PT_PC, ucp_Cc }, { 85, PT_SC, ucp_Canadian_Aboriginal },
{ 101, PT_PC, ucp_Cf }, { 105, PT_SC, ucp_Carian },
{ 104, PT_SC, ucp_Cham }, { 112, PT_PC, ucp_Cc },
{ 109, PT_SC, ucp_Cherokee }, { 115, PT_PC, ucp_Cf },
{ 118, PT_PC, ucp_Cn }, { 118, PT_SC, ucp_Cham },
{ 121, PT_PC, ucp_Co }, { 123, PT_SC, ucp_Cherokee },
{ 124, PT_SC, ucp_Common }, { 132, PT_PC, ucp_Cn },
{ 131, PT_SC, ucp_Coptic }, { 135, PT_PC, ucp_Co },
{ 138, PT_PC, ucp_Cs }, { 138, PT_SC, ucp_Common },
{ 141, PT_SC, ucp_Cuneiform }, { 145, PT_SC, ucp_Coptic },
{ 151, PT_SC, ucp_Cypriot }, { 152, PT_PC, ucp_Cs },
{ 159, PT_SC, ucp_Cyrillic }, { 155, PT_SC, ucp_Cuneiform },
{ 168, PT_SC, ucp_Deseret }, { 165, PT_SC, ucp_Cypriot },
{ 176, PT_SC, ucp_Devanagari }, { 173, PT_SC, ucp_Cyrillic },
{ 187, PT_SC, ucp_Ethiopic }, { 182, PT_SC, ucp_Deseret },
{ 196, PT_SC, ucp_Georgian }, { 190, PT_SC, ucp_Devanagari },
{ 205, PT_SC, ucp_Glagolitic }, { 201, PT_SC, ucp_Egyptian_Hieroglyphs },
{ 216, PT_SC, ucp_Gothic }, { 222, PT_SC, ucp_Ethiopic },
{ 223, PT_SC, ucp_Greek }, { 231, PT_SC, ucp_Georgian },
{ 229, PT_SC, ucp_Gujarati }, { 240, PT_SC, ucp_Glagolitic },
{ 238, PT_SC, ucp_Gurmukhi }, { 251, PT_SC, ucp_Gothic },
{ 247, PT_SC, ucp_Han }, { 258, PT_SC, ucp_Greek },
{ 251, PT_SC, ucp_Hangul }, { 264, PT_SC, ucp_Gujarati },
{ 258, PT_SC, ucp_Hanunoo }, { 273, PT_SC, ucp_Gurmukhi },
{ 266, PT_SC, ucp_Hebrew }, { 282, PT_SC, ucp_Han },
{ 273, PT_SC, ucp_Hiragana }, { 286, PT_SC, ucp_Hangul },
{ 282, PT_SC, ucp_Inherited }, { 293, PT_SC, ucp_Hanunoo },
{ 292, PT_SC, ucp_Kannada }, { 301, PT_SC, ucp_Hebrew },
{ 300, PT_SC, ucp_Katakana }, { 308, PT_SC, ucp_Hiragana },
{ 309, PT_SC, ucp_Kayah_Li }, { 317, PT_SC, ucp_Imperial_Aramaic },
{ 318, PT_SC, ucp_Kharoshthi }, { 334, PT_SC, ucp_Inherited },
{ 329, PT_SC, ucp_Khmer }, { 344, PT_SC, ucp_Inscriptional_Pahlavi },
{ 335, PT_GC, ucp_L }, { 366, PT_SC, ucp_Inscriptional_Parthian },
{ 337, PT_LAMP, 0 }, { 389, PT_SC, ucp_Javanese },
{ 340, PT_SC, ucp_Lao }, { 398, PT_SC, ucp_Kaithi },
{ 344, PT_SC, ucp_Latin }, { 405, PT_SC, ucp_Kannada },
{ 350, PT_SC, ucp_Lepcha }, { 413, PT_SC, ucp_Katakana },
{ 357, PT_SC, ucp_Limbu }, { 422, PT_SC, ucp_Kayah_Li },
{ 363, PT_SC, ucp_Linear_B }, { 431, PT_SC, ucp_Kharoshthi },
{ 372, PT_PC, ucp_Ll }, { 442, PT_SC, ucp_Khmer },
{ 375, PT_PC, ucp_Lm }, { 448, PT_GC, ucp_L },
{ 378, PT_PC, ucp_Lo }, { 450, PT_LAMP, 0 },
{ 381, PT_PC, ucp_Lt }, { 453, PT_SC, ucp_Lao },
{ 384, PT_PC, ucp_Lu }, { 457, PT_SC, ucp_Latin },
{ 387, PT_SC, ucp_Lycian }, { 463, PT_SC, ucp_Lepcha },
{ 394, PT_SC, ucp_Lydian }, { 470, PT_SC, ucp_Limbu },
{ 401, PT_GC, ucp_M }, { 476, PT_SC, ucp_Linear_B },
{ 403, PT_SC, ucp_Malayalam }, { 485, PT_SC, ucp_Lisu },
{ 413, PT_PC, ucp_Mc }, { 490, PT_PC, ucp_Ll },
{ 416, PT_PC, ucp_Me }, { 493, PT_PC, ucp_Lm },
{ 419, PT_PC, ucp_Mn }, { 496, PT_PC, ucp_Lo },
{ 422, PT_SC, ucp_Mongolian }, { 499, PT_PC, ucp_Lt },
{ 432, PT_SC, ucp_Myanmar }, { 502, PT_PC, ucp_Lu },
{ 440, PT_GC, ucp_N }, { 505, PT_SC, ucp_Lycian },
{ 442, PT_PC, ucp_Nd }, { 512, PT_SC, ucp_Lydian },
{ 445, PT_SC, ucp_New_Tai_Lue }, { 519, PT_GC, ucp_M },
{ 457, PT_SC, ucp_Nko }, { 521, PT_SC, ucp_Malayalam },
{ 461, PT_PC, ucp_Nl }, { 531, PT_PC, ucp_Mc },
{ 464, PT_PC, ucp_No }, { 534, PT_PC, ucp_Me },
{ 467, PT_SC, ucp_Ogham }, { 537, PT_SC, ucp_Meetei_Mayek },
{ 473, PT_SC, ucp_Ol_Chiki }, { 550, PT_PC, ucp_Mn },
{ 482, PT_SC, ucp_Old_Italic }, { 553, PT_SC, ucp_Mongolian },
{ 493, PT_SC, ucp_Old_Persian }, { 563, PT_SC, ucp_Myanmar },
{ 505, PT_SC, ucp_Oriya }, { 571, PT_GC, ucp_N },
{ 511, PT_SC, ucp_Osmanya }, { 573, PT_PC, ucp_Nd },
{ 519, PT_GC, ucp_P }, { 576, PT_SC, ucp_New_Tai_Lue },
{ 521, PT_PC, ucp_Pc }, { 588, PT_SC, ucp_Nko },
{ 524, PT_PC, ucp_Pd }, { 592, PT_PC, ucp_Nl },
{ 527, PT_PC, ucp_Pe }, { 595, PT_PC, ucp_No },
{ 530, PT_PC, ucp_Pf }, { 598, PT_SC, ucp_Ogham },
{ 533, PT_SC, ucp_Phags_Pa }, { 604, PT_SC, ucp_Ol_Chiki },
{ 542, PT_SC, ucp_Phoenician }, { 613, PT_SC, ucp_Old_Italic },
{ 553, PT_PC, ucp_Pi }, { 624, PT_SC, ucp_Old_Persian },
{ 556, PT_PC, ucp_Po }, { 636, PT_SC, ucp_Old_South_Arabian },
{ 559, PT_PC, ucp_Ps }, { 654, PT_SC, ucp_Old_Turkic },
{ 562, PT_SC, ucp_Rejang }, { 665, PT_SC, ucp_Oriya },
{ 569, PT_SC, ucp_Runic }, { 671, PT_SC, ucp_Osmanya },
{ 575, PT_GC, ucp_S }, { 679, PT_GC, ucp_P },
{ 577, PT_SC, ucp_Saurashtra }, { 681, PT_PC, ucp_Pc },
{ 588, PT_PC, ucp_Sc }, { 684, PT_PC, ucp_Pd },
{ 591, PT_SC, ucp_Shavian }, { 687, PT_PC, ucp_Pe },
{ 599, PT_SC, ucp_Sinhala }, { 690, PT_PC, ucp_Pf },
{ 607, PT_PC, ucp_Sk }, { 693, PT_SC, ucp_Phags_Pa },
{ 610, PT_PC, ucp_Sm }, { 702, PT_SC, ucp_Phoenician },
{ 613, PT_PC, ucp_So }, { 713, PT_PC, ucp_Pi },
{ 616, PT_SC, ucp_Sundanese }, { 716, PT_PC, ucp_Po },
{ 626, PT_SC, ucp_Syloti_Nagri }, { 719, PT_PC, ucp_Ps },
{ 639, PT_SC, ucp_Syriac }, { 722, PT_SC, ucp_Rejang },
{ 646, PT_SC, ucp_Tagalog }, { 729, PT_SC, ucp_Runic },
{ 654, PT_SC, ucp_Tagbanwa }, { 735, PT_GC, ucp_S },
{ 663, PT_SC, ucp_Tai_Le }, { 737, PT_SC, ucp_Samaritan },
{ 670, PT_SC, ucp_Tamil }, { 747, PT_SC, ucp_Saurashtra },
{ 676, PT_SC, ucp_Telugu }, { 758, PT_PC, ucp_Sc },
{ 683, PT_SC, ucp_Thaana }, { 761, PT_SC, ucp_Shavian },
{ 690, PT_SC, ucp_Thai }, { 769, PT_SC, ucp_Sinhala },
{ 695, PT_SC, ucp_Tibetan }, { 777, PT_PC, ucp_Sk },
{ 703, PT_SC, ucp_Tifinagh }, { 780, PT_PC, ucp_Sm },
{ 712, PT_SC, ucp_Ugaritic }, { 783, PT_PC, ucp_So },
{ 721, PT_SC, ucp_Vai }, { 786, PT_SC, ucp_Sundanese },
{ 725, PT_SC, ucp_Yi }, { 796, PT_SC, ucp_Syloti_Nagri },
{ 728, PT_GC, ucp_Z }, { 809, PT_SC, ucp_Syriac },
{ 730, PT_PC, ucp_Zl }, { 816, PT_SC, ucp_Tagalog },
{ 733, PT_PC, ucp_Zp }, { 824, PT_SC, ucp_Tagbanwa },
{ 736, PT_PC, ucp_Zs } { 833, PT_SC, ucp_Tai_Le },
{ 840, PT_SC, ucp_Tai_Tham },
{ 849, PT_SC, ucp_Tai_Viet },
{ 858, PT_SC, ucp_Tamil },
{ 864, PT_SC, ucp_Telugu },
{ 871, PT_SC, ucp_Thaana },
{ 878, PT_SC, ucp_Thai },
{ 883, PT_SC, ucp_Tibetan },
{ 891, PT_SC, ucp_Tifinagh },
{ 900, PT_SC, ucp_Ugaritic },
{ 909, PT_SC, ucp_Vai },
{ 913, PT_SC, ucp_Yi },
{ 916, PT_GC, ucp_Z },
{ 918, PT_PC, ucp_Zl },
{ 921, PT_PC, ucp_Zp },
{ 924, PT_PC, ucp_Zs }
}; };
const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table); const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table);

View File

@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language. and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Copyright (c) 1997-2008 University of Cambridge Copyright (c) 1997-2009 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -128,7 +128,9 @@ if (study != NULL)
{ {
*internal_study = *study; /* To copy other fields */ *internal_study = *study; /* To copy other fields */
internal_study->size = byteflip(study->size, sizeof(study->size)); internal_study->size = byteflip(study->size, sizeof(study->size));
internal_study->options = byteflip(study->options, sizeof(study->options)); internal_study->flags = byteflip(study->flags, sizeof(study->flags));
internal_study->minlength = byteflip(study->minlength,
sizeof(study->minlength));
} }
return internal_re; return internal_re;

View File

@ -52,6 +52,50 @@ properties. */
#include "ucp.h" /* Category definitions */ #include "ucp.h" /* Category definitions */
/* Table to translate from particular type value to the general value. It is
indexed by the detailed character type (ucp_Cc ... ucp_Zs); those values are
the same as GLib's GUnicodeType, so the rows follow the GUnicodeType
declaration order. The table is only ever read, hence const. */
static const int ucp_gentype[] = {
  ucp_C, ucp_C, ucp_C, ucp_C, ucp_C,  /* Cc, Cf, Cn, Co, Cs */
  ucp_L, ucp_L, ucp_L, ucp_L, ucp_L,  /* Ll, Lm, Lo, Lt, Lu */
  ucp_M, ucp_M, ucp_M,                /* Mc, Me, Mn */
  ucp_N, ucp_N, ucp_N,                /* Nd, Nl, No */
  ucp_P, ucp_P, ucp_P, ucp_P, ucp_P,  /* Pc, Pd, Pe, Pf, Pi */
  ucp_P, ucp_P,                       /* Po, Ps */
  ucp_S, ucp_S, ucp_S, ucp_S,         /* Sc, Sk, Sm, So */
  ucp_Z, ucp_Z, ucp_Z                 /* Zl, Zp, Zs */
};
/*************************************************
* Search table and return type *
*************************************************/
/* Three values are returned: the category is ucp_C, ucp_L, etc. The detailed
character type is ucp_Lu, ucp_Nd, etc. The script is ucp_Latin, etc.
Arguments:
c the character value
type_ptr the detailed character type is returned here
script_ptr the script is returned here
Returns: the character type category
*/
int
_pcre_ucp_findprop(const unsigned int c, int *type_ptr, int *script_ptr)
{
/* GLib supplies both properties. Its GUnicodeType values coincide with PCRE's
 * detailed type values (for example ucp_Ll == G_UNICODE_LOWERCASE_LETTER and
 * ucp_Zs == G_UNICODE_SPACE_SEPARATOR), so they are stored as they come. */
*type_ptr = (int)g_unichar_type(c);
*script_ptr = (int)g_unichar_get_script(c);
/* The detailed type just stored selects the general category. */
return ucp_gentype[*type_ptr];
}
/************************************************* /*************************************************
* Search table and return other case * * Search table and return other case *
*************************************************/ *************************************************/
@ -68,7 +112,7 @@ Returns: the other case or NOTACHAR if none
unsigned int unsigned int
_pcre_ucp_othercase(const unsigned int c) _pcre_ucp_othercase(const unsigned int c)
{ {
unsigned int other_case = NOTACHAR; int other_case = NOTACHAR;
if (g_unichar_islower(c)) if (g_unichar_islower(c))
other_case = g_unichar_toupper(c); other_case = g_unichar_toupper(c);

View File

@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language. and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel Written by Philip Hazel
Copyright (c) 1997-2008 University of Cambridge Copyright (c) 1997-2009 University of Cambridge
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -39,8 +39,7 @@ POSSIBILITY OF SUCH DAMAGE.
/* This module contains an internal function that is used to match an extended /* This module contains an internal function that is used to match an extended
class (one that contains characters whose values are > 255). It is used by both class. It is used by both pcre_exec() and pcre_dfa_exec(). */
pcre_exec() and pcre_def_exec(). */
#ifdef HAVE_CONFIG_H #ifdef HAVE_CONFIG_H
@ -55,7 +54,7 @@ pcre_exec() and pcre_def_exec(). */
*************************************************/ *************************************************/
/* This function is called to match a character against an extended class that /* This function is called to match a character against an extended class that
might contain values > 255. might contain values > 255 and/or Unicode properties.
Arguments: Arguments:
c the character c the character

View File

@ -6,9 +6,8 @@
#define _UCP_H #define _UCP_H
/* This file contains definitions of the property values that are returned by /* This file contains definitions of the property values that are returned by
the function _pcre_ucp_findprop(). New values that are added for new releases the UCD access macros. New values that are added for new releases of Unicode
of Unicode should always be at the end of each enum, for backwards should always be at the end of each enum, for backwards compatibility. */
compatibility. */
/* These are the general character categories. */ /* These are the general character categories. */
@ -121,24 +120,40 @@ enum {
ucp_Tifinagh = G_UNICODE_SCRIPT_TIFINAGH, ucp_Tifinagh = G_UNICODE_SCRIPT_TIFINAGH,
ucp_Ugaritic = G_UNICODE_SCRIPT_UGARITIC, ucp_Ugaritic = G_UNICODE_SCRIPT_UGARITIC,
ucp_Yi = G_UNICODE_SCRIPT_YI, ucp_Yi = G_UNICODE_SCRIPT_YI,
ucp_Balinese = G_UNICODE_SCRIPT_BALINESE, /* New for Unicode 5.0.0 */ ucp_Balinese = G_UNICODE_SCRIPT_BALINESE,
ucp_Cuneiform = G_UNICODE_SCRIPT_CUNEIFORM, /* New for Unicode 5.0.0 */ ucp_Cuneiform = G_UNICODE_SCRIPT_CUNEIFORM,
ucp_Nko = G_UNICODE_SCRIPT_NKO, /* New for Unicode 5.0.0 */ ucp_Nko = G_UNICODE_SCRIPT_NKO,
ucp_Phags_Pa = G_UNICODE_SCRIPT_PHAGS_PA, /* New for Unicode 5.0.0 */ ucp_Phags_Pa = G_UNICODE_SCRIPT_PHAGS_PA,
ucp_Phoenician = G_UNICODE_SCRIPT_PHOENICIAN, /* New for Unicode 5.0.0 */ ucp_Phoenician = G_UNICODE_SCRIPT_PHOENICIAN,
ucp_Carian = G_UNICODE_SCRIPT_CARIAN, /* New for Unicode 5.1 */ ucp_Carian = G_UNICODE_SCRIPT_CARIAN,
ucp_Cham = G_UNICODE_SCRIPT_CHAM, /* New for Unicode 5.1 */ ucp_Cham = G_UNICODE_SCRIPT_CHAM,
ucp_Kayah_Li = G_UNICODE_SCRIPT_KAYAH_LI, /* New for Unicode 5.1 */ ucp_Kayah_Li = G_UNICODE_SCRIPT_KAYAH_LI,
ucp_Lepcha = G_UNICODE_SCRIPT_LEPCHA, /* New for Unicode 5.1 */ ucp_Lepcha = G_UNICODE_SCRIPT_LEPCHA,
ucp_Lycian = G_UNICODE_SCRIPT_LYCIAN, /* New for Unicode 5.1 */ ucp_Lycian = G_UNICODE_SCRIPT_LYCIAN,
ucp_Lydian = G_UNICODE_SCRIPT_LYDIAN, /* New for Unicode 5.1 */ ucp_Lydian = G_UNICODE_SCRIPT_LYDIAN,
ucp_Ol_Chiki = G_UNICODE_SCRIPT_OL_CHIKI, /* New for Unicode 5.1 */ ucp_Ol_Chiki = G_UNICODE_SCRIPT_OL_CHIKI,
ucp_Rejang = G_UNICODE_SCRIPT_REJANG, /* New for Unicode 5.1 */ ucp_Rejang = G_UNICODE_SCRIPT_REJANG,
ucp_Saurashtra = G_UNICODE_SCRIPT_SAURASHTRA, /* New for Unicode 5.1 */ ucp_Saurashtra = G_UNICODE_SCRIPT_SAURASHTRA,
ucp_Sundanese = G_UNICODE_SCRIPT_SUNDANESE, /* New for Unicode 5.1 */ ucp_Sundanese = G_UNICODE_SCRIPT_SUNDANESE,
ucp_Vai = G_UNICODE_SCRIPT_VAI /* New for Unicode 5.1 */ ucp_Vai = G_UNICODE_SCRIPT_VAI,
ucp_Avestan = G_UNICODE_SCRIPT_AVESTAN,
ucp_Bamum = G_UNICODE_SCRIPT_BAMUM,
ucp_Egyptian_Hieroglyphs = G_UNICODE_SCRIPT_EGYPTIAN_HIEROGLYPHS,
ucp_Imperial_Aramaic = G_UNICODE_SCRIPT_IMPERIAL_ARAMAIC,
ucp_Inscriptional_Pahlavi = G_UNICODE_SCRIPT_INSCRIPTIONAL_PAHLAVI,
ucp_Inscriptional_Parthian = G_UNICODE_SCRIPT_INSCRIPTIONAL_PARTHIAN,
ucp_Javanese = G_UNICODE_SCRIPT_JAVANESE,
ucp_Kaithi = G_UNICODE_SCRIPT_KAITHI,
ucp_Lisu = G_UNICODE_SCRIPT_LISU,
ucp_Meetei_Mayek = G_UNICODE_SCRIPT_MEETEI_MAYEK,
ucp_Old_South_Arabian = G_UNICODE_SCRIPT_OLD_SOUTH_ARABIAN,
ucp_Old_Turkic = G_UNICODE_SCRIPT_OLD_TURKISH,
ucp_Samaritan = G_UNICODE_SCRIPT_SAMARITAN,
ucp_Tai_Tham = G_UNICODE_SCRIPT_TAI_THAM,
ucp_Tai_Viet = G_UNICODE_SCRIPT_TAI_VIET
}; };
#endif #endif
/* End of ucp.h */ /* End of ucp.h */

View File

@ -1,92 +0,0 @@
/*************************************************
* Unicode Property Table handler *
*************************************************/
#ifndef _UCPINTERNAL_H
#define _UCPINTERNAL_H
/* Internal header file defining the layout of the bits in each pair of 32-bit
words that form a data item in the table. */
/* One data item of the Unicode property table: a pair of 32-bit words. The
bit layout of each word is given by the f0_ and f1_ macros that follow. */
typedef struct cnode {
pcre_uint32 f0; /* Script number, range flag, and code point */
pcre_uint32 f1; /* Character type, and range or other-case offset */
} cnode;
/* Things for the f0 field (script number, range flag, code point) */
#define f0_scriptmask 0xff000000 /* Mask for script field (top 8 bits) */
#define f0_scriptshift 24 /* Shift for script value */
#define f0_rangeflag 0x00800000 /* Flag for a range item */
#define f0_charmask 0x001fffff /* Mask for code point value (low 21 bits) */
/* Things for the f1 field (character type, range or other-case offset) */
#define f1_typemask 0xfc000000 /* Mask for char type field (top 6 bits) */
#define f1_typeshift 26 /* Shift for the type field */
#define f1_rangemask 0x0000ffff /* Mask for a range offset (unsigned) */
#define f1_casemask 0x0000ffff /* Mask for a case offset (signed, 16 bits) */
/* NOTE(review): f1_caseneg presumably supplies the bits needed to sign-extend
a negative 16-bit case offset to 32 bits; its user is not visible here, so
confirm against the table lookup code. */
#define f1_caseneg 0xffff8000 /* Bits for negation */
/* The data consists of a vector of structures of type cnode. The two unsigned
32-bit integers are used as follows:
(f0) (1) The most significant byte holds the script number. The numbers are
defined by the enum in ucp.h.
(2) The 0x00800000 bit is set if this entry defines a range of characters.
It is not set if this entry defines a single character.
(3) The 0x00600000 bits are spare.
(4) The 0x001fffff bits contain the code point. No Unicode code point will
ever be greater than 0x0010ffff, so this should be OK for ever.
(f1) (1) The 0xfc000000 bits contain the character type number. The numbers are
defined by an enum in ucp.h.
(2) The 0x03ff0000 bits are spare.
(3) The 0x0000ffff bits contain EITHER the unsigned offset to the top of
range if this entry defines a range, OR the *signed* offset to the
character's "other case" partner if this entry defines a single
character. There is no partner if the value is zero.
-------------------------------------------------------------------------------
| script (8) |.|.|.| codepoint (21) || type (6) |.|.| spare (8) | offset (16) |
-------------------------------------------------------------------------------
              | | |                              | |
              | | |-> spare                      | |-> spare
              | |                                |
              | |-> spare                        |-> spare
              |
              |-> range flag
The upper/lower casing information is set only for characters that come in
pairs. The non-one-to-one mappings in the Unicode data are ignored.
When searching the data, proceed as follows:
(1) Set up for a binary chop search.
(2) If the top is not greater than the bottom, the character is not in the
table. Its type must therefore be "Cn" ("Undefined").
(3) Find the middle vector element.
(4) Extract the code point and compare. If equal, we are done.
(5) If the test character is smaller, set the top to the current point, and
goto (2).
(6) If the current entry defines a range, compute the last character by adding
the offset, and see if the test character is within the range. If it is,
we are done.
(7) Otherwise, set the bottom to one element past the current point and goto
(2).
*/
#endif /* _UCPINTERNAL_H */
/* End of ucpinternal.h */