Forgotten files

This commit is contained in:
Matthias Clasen 2011-01-22 00:01:54 -05:00
parent 3f059a6a12
commit fb2809ec99
10 changed files with 2273 additions and 1001 deletions

View File

@ -5,7 +5,7 @@
/* This is the public header file for the PCRE library, to be #included by
applications that call the PCRE functions.
Copyright (c) 1997-2009 University of Cambridge
Copyright (c) 1997-2010 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE.
/* The current PCRE version information. */
#define PCRE_MAJOR 8
#define PCRE_MINOR 02
#define PCRE_MINOR 12
#define PCRE_PRERELEASE
#define PCRE_DATE 2010-03-19
#define PCRE_DATE 2011-01-15
/* When an application links to a PCRE DLL in Windows, the symbols that are
imported have to be identified as such. When building PCRE, the appropriate
@ -96,41 +96,44 @@ extern "C" {
#endif
/* Options. Some are compile-time only, some are run-time only, and some are
both, so we keep them all distinct. */
both, so we keep them all distinct. However, almost all the bits in the options
word are now used. In the long run, we may have to re-use some of the
compile-time only bits for runtime options, or vice versa. */
#define PCRE_CASELESS 0x00000001
#define PCRE_MULTILINE 0x00000002
#define PCRE_DOTALL 0x00000004
#define PCRE_EXTENDED 0x00000008
#define PCRE_ANCHORED 0x00000010
#define PCRE_DOLLAR_ENDONLY 0x00000020
#define PCRE_EXTRA 0x00000040
#define PCRE_NOTBOL 0x00000080
#define PCRE_NOTEOL 0x00000100
#define PCRE_UNGREEDY 0x00000200
#define PCRE_NOTEMPTY 0x00000400
#define PCRE_UTF8 0x00000800
#define PCRE_NO_AUTO_CAPTURE 0x00001000
#define PCRE_NO_UTF8_CHECK 0x00002000
#define PCRE_AUTO_CALLOUT 0x00004000
#define PCRE_PARTIAL_SOFT 0x00008000
#define PCRE_CASELESS 0x00000001 /* Compile */
#define PCRE_MULTILINE 0x00000002 /* Compile */
#define PCRE_DOTALL 0x00000004 /* Compile */
#define PCRE_EXTENDED 0x00000008 /* Compile */
#define PCRE_ANCHORED 0x00000010 /* Compile, exec, DFA exec */
#define PCRE_DOLLAR_ENDONLY 0x00000020 /* Compile */
#define PCRE_EXTRA 0x00000040 /* Compile */
#define PCRE_NOTBOL 0x00000080 /* Exec, DFA exec */
#define PCRE_NOTEOL 0x00000100 /* Exec, DFA exec */
#define PCRE_UNGREEDY 0x00000200 /* Compile */
#define PCRE_NOTEMPTY 0x00000400 /* Exec, DFA exec */
#define PCRE_UTF8 0x00000800 /* Compile */
#define PCRE_NO_AUTO_CAPTURE 0x00001000 /* Compile */
#define PCRE_NO_UTF8_CHECK 0x00002000 /* Compile, exec, DFA exec */
#define PCRE_AUTO_CALLOUT 0x00004000 /* Compile */
#define PCRE_PARTIAL_SOFT 0x00008000 /* Exec, DFA exec */
#define PCRE_PARTIAL 0x00008000 /* Backwards compatible synonym */
#define PCRE_DFA_SHORTEST 0x00010000
#define PCRE_DFA_RESTART 0x00020000
#define PCRE_FIRSTLINE 0x00040000
#define PCRE_DUPNAMES 0x00080000
#define PCRE_NEWLINE_CR 0x00100000
#define PCRE_NEWLINE_LF 0x00200000
#define PCRE_NEWLINE_CRLF 0x00300000
#define PCRE_NEWLINE_ANY 0x00400000
#define PCRE_NEWLINE_ANYCRLF 0x00500000
#define PCRE_BSR_ANYCRLF 0x00800000
#define PCRE_BSR_UNICODE 0x01000000
#define PCRE_JAVASCRIPT_COMPAT 0x02000000
#define PCRE_NO_START_OPTIMIZE 0x04000000
#define PCRE_NO_START_OPTIMISE 0x04000000
#define PCRE_PARTIAL_HARD 0x08000000
#define PCRE_NOTEMPTY_ATSTART 0x10000000
#define PCRE_DFA_SHORTEST 0x00010000 /* DFA exec */
#define PCRE_DFA_RESTART 0x00020000 /* DFA exec */
#define PCRE_FIRSTLINE 0x00040000 /* Compile */
#define PCRE_DUPNAMES 0x00080000 /* Compile */
#define PCRE_NEWLINE_CR 0x00100000 /* Compile, exec, DFA exec */
#define PCRE_NEWLINE_LF 0x00200000 /* Compile, exec, DFA exec */
#define PCRE_NEWLINE_CRLF 0x00300000 /* Compile, exec, DFA exec */
#define PCRE_NEWLINE_ANY 0x00400000 /* Compile, exec, DFA exec */
#define PCRE_NEWLINE_ANYCRLF 0x00500000 /* Compile, exec, DFA exec */
#define PCRE_BSR_ANYCRLF 0x00800000 /* Compile, exec, DFA exec */
#define PCRE_BSR_UNICODE 0x01000000 /* Compile, exec, DFA exec */
#define PCRE_JAVASCRIPT_COMPAT 0x02000000 /* Compile */
#define PCRE_NO_START_OPTIMIZE 0x04000000 /* Compile, exec, DFA exec */
#define PCRE_NO_START_OPTIMISE 0x04000000 /* Synonym */
#define PCRE_PARTIAL_HARD 0x08000000 /* Exec, DFA exec */
#define PCRE_NOTEMPTY_ATSTART 0x10000000 /* Exec, DFA exec */
#define PCRE_UCP 0x20000000 /* Compile */
/* Exec-time and get/set-time error codes */
@ -158,6 +161,8 @@ both, so we keep them all distinct. */
#define PCRE_ERROR_RECURSIONLIMIT (-21)
#define PCRE_ERROR_NULLWSLIMIT (-22) /* No longer actually used */
#define PCRE_ERROR_BADNEWLINE (-23)
#define PCRE_ERROR_BADOFFSET (-24)
#define PCRE_ERROR_SHORTUTF8 (-25)
/* Request types for pcre_fullinfo() */
@ -200,6 +205,7 @@ these bits, just add new ones on the end, in order to remain compatible. */
#define PCRE_EXTRA_CALLOUT_DATA 0x0004
#define PCRE_EXTRA_TABLES 0x0008
#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0x0010
#define PCRE_EXTRA_MARK 0x0020
/* Types */
@ -225,6 +231,7 @@ typedef struct pcre_extra {
void *callout_data; /* Data passed back in callouts */
const unsigned char *tables; /* Pointer to character tables */
unsigned long int match_limit_recursion; /* Max recursive calls to match() */
unsigned char **mark; /* For passing back a mark pointer */
} pcre_extra;
/* The structure for passing out data via the pcre_callout_function. We use a

View File

@ -14,7 +14,7 @@ example ISO-8859-1. When dftables is run, it creates these tables in the
current locale. If PCRE is configured with --enable-rebuild-chartables, this
happens automatically.
The following #includes are present because without the gcc 4.x may remove the
The following #includes are present because without them gcc 4.x may remove the
array definition from the final binary if PCRE is built into a static library
and dead code stripping is activated. This leads to link errors. Pulling in the
header ensures that the array gets flagged as "someone outside this compilation

File diff suppressed because it is too large Load Diff

View File

@ -106,7 +106,7 @@ never stored, so we push them well clear of the normal opcodes. */
/* This table identifies those opcodes that are followed immediately by a
character that is to be tested in some way. This makes is possible to
character that is to be tested in some way. This makes it possible to
centralize the loading of these characters. In the case of Type * etc, the
"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
small value. Non-zero values in the table are the offsets from the opcode where
@ -161,8 +161,9 @@ static const uschar coptable[] = {
0, 0, /* RREF, NRREF */
0, /* DEF */
0, 0, /* BRAZERO, BRAMINZERO */
0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */
0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
};
/* This table identifies those opcodes that inspect a character. It is used to
@ -218,8 +219,9 @@ static const uschar poptable[] = {
0, 0, /* RREF, NRREF */
0, /* DEF */
0, 0, /* BRAZERO, BRAMINZERO */
0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */
0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
};
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
@ -473,7 +475,7 @@ if (*first_op == OP_REVERSE)
{
gone_back = (current_subject - max_back < start_subject)?
current_subject - start_subject : max_back;
(int)(current_subject - start_subject) : max_back;
current_subject -= gone_back;
}
@ -490,7 +492,7 @@ if (*first_op == OP_REVERSE)
int back = GET(end_code, 2+LINK_SIZE);
if (back <= gone_back)
{
int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
ADD_NEW_DATA(-bstate, 0, gone_back - back);
}
end_code += GET(end_code, 1);
@ -526,7 +528,7 @@ else
((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
do
{
ADD_NEW(end_code - start_code + length, 0);
ADD_NEW((int)(end_code - start_code + length), 0);
end_code += GET(end_code, 1);
length = 1 + LINK_SIZE;
}
@ -753,8 +755,8 @@ for (;;)
if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
if (offsetcount >= 2)
{
offsets[0] = current_subject - start_subject;
offsets[1] = ptr - start_subject;
offsets[0] = (int)(current_subject - start_subject);
offsets[1] = (int)(ptr - start_subject);
DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
offsets[1] - offsets[0], current_subject));
}
@ -776,7 +778,7 @@ for (;;)
/*-----------------------------------------------------------------*/
case OP_ALT:
do { code += GET(code, 1); } while (*code == OP_ALT);
ADD_ACTIVE(code - start_code, 0);
ADD_ACTIVE((int)(code - start_code), 0);
break;
/*-----------------------------------------------------------------*/
@ -784,7 +786,7 @@ for (;;)
case OP_SBRA:
do
{
ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
code += GET(code, 1);
}
while (*code == OP_ALT);
@ -793,11 +795,11 @@ for (;;)
/*-----------------------------------------------------------------*/
case OP_CBRA:
case OP_SCBRA:
ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
ADD_ACTIVE((int)(code - start_code + 3 + LINK_SIZE), 0);
code += GET(code, 1);
while (*code == OP_ALT)
{
ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
code += GET(code, 1);
}
break;
@ -808,14 +810,14 @@ for (;;)
ADD_ACTIVE(state_offset + 1, 0);
code += 1 + GET(code, 2);
while (*code == OP_ALT) code += GET(code, 1);
ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
break;
/*-----------------------------------------------------------------*/
case OP_SKIPZERO:
code += 1 + GET(code, 2);
while (*code == OP_ALT) code += GET(code, 1);
ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
break;
/*-----------------------------------------------------------------*/
@ -829,7 +831,12 @@ for (;;)
/*-----------------------------------------------------------------*/
case OP_EOD:
if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
if (ptr >= end_subject)
{
if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
could_continue = TRUE;
else { ADD_ACTIVE(state_offset + 1, 0); }
}
break;
/*-----------------------------------------------------------------*/
@ -869,7 +876,9 @@ for (;;)
/*-----------------------------------------------------------------*/
case OP_EODN:
if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
could_continue = TRUE;
else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
{ ADD_ACTIVE(state_offset + 1, 0); }
break;
@ -877,7 +886,9 @@ for (;;)
case OP_DOLL:
if ((md->moptions & PCRE_NOTEOL) == 0)
{
if (clen == 0 ||
if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
could_continue = TRUE;
else if (clen == 0 ||
((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
))
@ -920,13 +931,37 @@ for (;;)
if (utf8) BACKCHAR(temp);
#endif
GETCHARTEST(d, temp);
#ifdef SUPPORT_UCP
if ((md->poptions & PCRE_UCP) != 0)
{
if (d == '_') left_word = TRUE; else
{
int cat = UCD_CATEGORY(d);
left_word = (cat == ucp_L || cat == ucp_N);
}
}
else
#endif
left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
}
else left_word = 0;
else left_word = FALSE;
if (clen > 0)
{
#ifdef SUPPORT_UCP
if ((md->poptions & PCRE_UCP) != 0)
{
if (c == '_') right_word = TRUE; else
{
int cat = UCD_CATEGORY(c);
right_word = (cat == ucp_L || cat == ucp_N);
}
}
else
#endif
right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
else right_word = 0;
}
else right_word = FALSE;
if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
{ ADD_ACTIVE(state_offset + 1, 0); }
@ -953,7 +988,8 @@ for (;;)
break;
case PT_LAMP:
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
OK = chartype == ucp_Lu || chartype == ucp_Ll ||
chartype == ucp_Lt;
break;
case PT_GC:
@ -968,6 +1004,30 @@ for (;;)
OK = UCD_SCRIPT(c) == code[2];
break;
/* These are specials for combination cases. */
case PT_ALNUM:
OK = _pcre_ucp_gentype[chartype] == ucp_L ||
_pcre_ucp_gentype[chartype] == ucp_N;
break;
case PT_SPACE: /* Perl space */
OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
break;
case PT_PXSPACE: /* POSIX space */
OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
c == CHAR_FF || c == CHAR_CR;
break;
case PT_WORD:
OK = _pcre_ucp_gentype[chartype] == ucp_L ||
_pcre_ucp_gentype[chartype] == ucp_N ||
c == CHAR_UNDERSCORE;
break;
/* Should never occur, but keep compilers from grumbling. */
default:
@ -1122,7 +1182,8 @@ for (;;)
break;
case PT_LAMP:
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
OK = chartype == ucp_Lu || chartype == ucp_Ll ||
chartype == ucp_Lt;
break;
case PT_GC:
@ -1137,6 +1198,30 @@ for (;;)
OK = UCD_SCRIPT(c) == code[3];
break;
/* These are specials for combination cases. */
case PT_ALNUM:
OK = _pcre_ucp_gentype[chartype] == ucp_L ||
_pcre_ucp_gentype[chartype] == ucp_N;
break;
case PT_SPACE: /* Perl space */
OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
break;
case PT_PXSPACE: /* POSIX space */
OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
c == CHAR_FF || c == CHAR_CR;
break;
case PT_WORD:
OK = _pcre_ucp_gentype[chartype] == ucp_L ||
_pcre_ucp_gentype[chartype] == ucp_N ||
c == CHAR_UNDERSCORE;
break;
/* Should never occur, but keep compilers from grumbling. */
default:
@ -1344,7 +1429,8 @@ for (;;)
break;
case PT_LAMP:
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
OK = chartype == ucp_Lu || chartype == ucp_Ll ||
chartype == ucp_Lt;
break;
case PT_GC:
@ -1359,6 +1445,30 @@ for (;;)
OK = UCD_SCRIPT(c) == code[3];
break;
/* These are specials for combination cases. */
case PT_ALNUM:
OK = _pcre_ucp_gentype[chartype] == ucp_L ||
_pcre_ucp_gentype[chartype] == ucp_N;
break;
case PT_SPACE: /* Perl space */
OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
break;
case PT_PXSPACE: /* POSIX space */
OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
c == CHAR_FF || c == CHAR_CR;
break;
case PT_WORD:
OK = _pcre_ucp_gentype[chartype] == ucp_L ||
_pcre_ucp_gentype[chartype] == ucp_N ||
c == CHAR_UNDERSCORE;
break;
/* Should never occur, but keep compilers from grumbling. */
default:
@ -1591,7 +1701,8 @@ for (;;)
break;
case PT_LAMP:
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
OK = chartype == ucp_Lu || chartype == ucp_Ll ||
chartype == ucp_Lt;
break;
case PT_GC:
@ -1606,6 +1717,30 @@ for (;;)
OK = UCD_SCRIPT(c) == code[5];
break;
/* These are specials for combination cases. */
case PT_ALNUM:
OK = _pcre_ucp_gentype[chartype] == ucp_L ||
_pcre_ucp_gentype[chartype] == ucp_N;
break;
case PT_SPACE: /* Perl space */
OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
break;
case PT_PXSPACE: /* POSIX space */
OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
c == CHAR_FF || c == CHAR_CR;
break;
case PT_WORD:
OK = _pcre_ucp_gentype[chartype] == ucp_L ||
_pcre_ucp_gentype[chartype] == ucp_N ||
c == CHAR_UNDERSCORE;
break;
/* Should never occur, but keep compilers from grumbling. */
default:
@ -2233,7 +2368,7 @@ for (;;)
points to the byte after the end of the class. If there is a
quantifier, this is where it will be. */
next_state_offset = ecode - start_code;
next_state_offset = (int)(ecode - start_code);
switch (*ecode)
{
@ -2304,7 +2439,7 @@ for (;;)
md, /* static match data */
code, /* this subexpression's code */
ptr, /* where we currently are */
ptr - start_subject, /* start offset */
(int)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(int), /* size of same */
local_workspace, /* workspace vector */
@ -2315,7 +2450,7 @@ for (;;)
if (rc == PCRE_ERROR_DFA_UITEM) return rc;
if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
{ ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
{ ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
}
break;
@ -2342,9 +2477,9 @@ for (;;)
cb.callout_number = code[LINK_SIZE+2];
cb.offset_vector = offsets;
cb.subject = (PCRE_SPTR)start_subject;
cb.subject_length = end_subject - start_subject;
cb.start_match = current_subject - start_subject;
cb.current_position = ptr - start_subject;
cb.subject_length = (int)(end_subject - start_subject);
cb.start_match = (int)(current_subject - start_subject);
cb.current_position = (int)(ptr - start_subject);
cb.pattern_position = GET(code, LINK_SIZE + 3);
cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
cb.capture_top = 1;
@ -2395,7 +2530,7 @@ for (;;)
md, /* fixed match data */
asscode, /* this subexpression's code */
ptr, /* where we currently are */
ptr - start_subject, /* start offset */
(int)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(int), /* size of same */
local_workspace, /* workspace vector */
@ -2407,7 +2542,7 @@ for (;;)
if (rc == PCRE_ERROR_DFA_UITEM) return rc;
if ((rc >= 0) ==
(condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
{ ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
{ ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
else
{ ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
}
@ -2428,7 +2563,7 @@ for (;;)
md, /* fixed match data */
start_code + GET(code, 1), /* this subexpression's code */
ptr, /* where we currently are */
ptr - start_subject, /* start offset */
(int)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(int), /* size of same */
local_workspace, /* workspace vector */
@ -2480,7 +2615,7 @@ for (;;)
md, /* fixed match data */
code, /* this subexpression's code */
ptr, /* where we currently are */
ptr - start_subject, /* start offset */
(int)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
sizeof(local_offsets)/sizeof(int), /* size of same */
local_workspace, /* workspace vector */
@ -2497,7 +2632,8 @@ for (;;)
do { end_subpattern += GET(end_subpattern, 1); }
while (*end_subpattern == OP_ALT);
next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
next_state_offset =
(int)(end_subpattern - start_code + LINK_SIZE + 1);
/* If the end of this subpattern is KETRMAX or KETRMIN, we must
arrange for the repeat state also to be added to the relevant list.
@ -2505,7 +2641,7 @@ for (;;)
repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
*end_subpattern == OP_KETRMIN)?
end_subpattern - start_code - GET(end_subpattern, 1) : -1;
(int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
/* If we have matched an empty string, add the next state at the
current character pointer. This is important so that the duplicate
@ -2569,9 +2705,9 @@ for (;;)
cb.callout_number = code[1];
cb.offset_vector = offsets;
cb.subject = (PCRE_SPTR)start_subject;
cb.subject_length = end_subject - start_subject;
cb.start_match = current_subject - start_subject;
cb.current_position = ptr - start_subject;
cb.subject_length = (int)(end_subject - start_subject);
cb.start_match = (int)(current_subject - start_subject);
cb.current_position = (int)(ptr - start_subject);
cb.pattern_position = GET(code, 2);
cb.next_item_length = GET(code, 2 + LINK_SIZE);
cb.capture_top = 1;
@ -2617,13 +2753,13 @@ for (;;)
((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
match_count < 0) /* no matches */
) && /* And... */
ptr >= end_subject && /* Reached end of subject */
ptr > current_subject) /* Matched non-empty string */
ptr >= end_subject && /* Reached end of subject */
ptr > md->start_used_ptr) /* Inspected non-empty string */
{
if (offsetcount >= 2)
{
offsets[0] = md->start_used_ptr - start_subject;
offsets[1] = end_subject - start_subject;
offsets[0] = (int)(md->start_used_ptr - start_subject);
offsets[1] = (int)(end_subject - start_subject);
}
match_count = PCRE_ERROR_PARTIAL;
}
@ -2708,6 +2844,7 @@ if (re == NULL || subject == NULL || workspace == NULL ||
(offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
/* We need to find the pointer to any study data before we test for byte
flipping, so we scan the extra_data block first. This may set two fields in the
@ -2826,16 +2963,14 @@ back the character offset. */
#ifdef SUPPORT_UTF8
if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
{
if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
return PCRE_ERROR_BADUTF8;
int tb;
if ((tb = _pcre_valid_utf8((uschar *)subject, length)) >= 0)
return (tb == length && (options & PCRE_PARTIAL_HARD) != 0)?
PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
if (start_offset > 0 && start_offset < length)
{
int tb = ((uschar *)subject)[start_offset];
if (tb > 127)
{
tb &= 0xc0;
if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
}
tb = ((USPTR)subject)[start_offset] & 0xc0;
if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
}
}
#endif
@ -2922,9 +3057,11 @@ for (;;)
/* There are some optimizations that avoid running the match if a known
starting point is not found. However, there is an option that disables
these, for testing and for ensuring that all callouts do actually occur. */
these, for testing and for ensuring that all callouts do actually occur.
The option can be set in the regex by (*NO_START_OPT) or passed in
match-time options. */
if ((options & PCRE_NO_START_OPTIMIZE) == 0)
if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
{
/* Advance to a known first byte. */
@ -2982,8 +3119,16 @@ for (;;)
while (current_subject < end_subject)
{
register unsigned int c = *current_subject;
if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
else break;
if ((start_bits[c/8] & (1 << (c&7))) == 0)
{
current_subject++;
#ifdef SUPPORT_UTF8
if (utf8)
while(current_subject < end_subject &&
(*current_subject & 0xc0) == 0x80) current_subject++;
#endif
}
else break;
}
}
}

File diff suppressed because it is too large Load Diff

View File

@ -408,9 +408,10 @@ capturing parenthesis numbers in back references. */
/* When UTF-8 encoding is being used, a character is no longer just a single
byte. The macros for character handling generate simple sequences when used in
byte-mode, and more complicated ones for UTF-8 characters. BACKCHAR should
never be called in byte mode. To make sure it can never even appear when UTF-8
support is omitted, we don't even define it. */
byte-mode, and more complicated ones for UTF-8 characters. GETCHARLENTEST is
not used when UTF-8 is not supported, so it is not defined, and BACKCHAR should
never be called in byte mode. To make sure they can never even appear when
UTF-8 support is omitted, we don't even define them. */
#ifndef SUPPORT_UTF8
#define GETCHAR(c, eptr) c = *eptr;
@ -418,43 +419,83 @@ support is omitted, we don't even define it. */
#define GETCHARINC(c, eptr) c = *eptr++;
#define GETCHARINCTEST(c, eptr) c = *eptr++;
#define GETCHARLEN(c, eptr, len) c = *eptr;
/* #define GETCHARLENTEST(c, eptr, len) */
/* #define BACKCHAR(eptr) */
#else /* SUPPORT_UTF8 */
/* These macros were originally written in the form of loops that used data
from the tables whose names start with _pcre_utf8_table. They were rewritten by
a user so as not to use loops, because in some environments this gives a
significant performance advantage, and it seems never to do any harm. */
/* Base macro to pick up the remaining bytes of a UTF-8 character, not
advancing the pointer.

On entry c must already hold the first byte of the character, and that byte
must start a multi-byte sequence (GETCHAR and GETCHARTEST invoke this only
when c >= 0xc0). The bits of c select the sequence length (2 to 6 bytes); the
trailing bytes are read from eptr[1] onwards and the decoded value is stored
back in c. No validity checks are made on the trailing bytes.

c must be a modifiable lvalue. Both arguments are evaluated several times, so
they must not have side effects. They are parenthesized in the expansion so
that an expression such as "p + 1" can be passed as eptr. */

#define GETUTF8(c, eptr) \
  { \
  if (((c) & 0x20) == 0)                         /* 2-byte character */ \
    (c) = (((c) & 0x1f) << 6) | ((eptr)[1] & 0x3f); \
  else if (((c) & 0x10) == 0)                    /* 3-byte character */ \
    (c) = (((c) & 0x0f) << 12) | (((eptr)[1] & 0x3f) << 6) | \
          ((eptr)[2] & 0x3f); \
  else if (((c) & 0x08) == 0)                    /* 4-byte character */ \
    (c) = (((c) & 0x07) << 18) | (((eptr)[1] & 0x3f) << 12) | \
          (((eptr)[2] & 0x3f) << 6) | ((eptr)[3] & 0x3f); \
  else if (((c) & 0x04) == 0)                    /* 5-byte character */ \
    (c) = (((c) & 0x03) << 24) | (((eptr)[1] & 0x3f) << 18) | \
          (((eptr)[2] & 0x3f) << 12) | (((eptr)[3] & 0x3f) << 6) | \
          ((eptr)[4] & 0x3f); \
  else                                           /* 6-byte character */ \
    (c) = (((c) & 0x01) << 30) | (((eptr)[1] & 0x3f) << 24) | \
          (((eptr)[2] & 0x3f) << 18) | (((eptr)[3] & 0x3f) << 12) | \
          (((eptr)[4] & 0x3f) << 6) | ((eptr)[5] & 0x3f); \
  }
/* Get the next UTF-8 character, not advancing the pointer. This is called when
we know we are in UTF-8 mode. */
#define GETCHAR(c, eptr) \
c = *eptr; \
if (c >= 0xc0) \
{ \
int gcii; \
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
int gcss = 6*gcaa; \
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
for (gcii = 1; gcii <= gcaa; gcii++) \
{ \
gcss -= 6; \
c |= (eptr[gcii] & 0x3f) << gcss; \
} \
}
if (c >= 0xc0) GETUTF8(c, eptr);
/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
pointer. */
#define GETCHARTEST(c, eptr) \
c = *eptr; \
if (utf8 && c >= 0xc0) \
if (utf8 && c >= 0xc0) GETUTF8(c, eptr);
/* Base macro to pick up the remaining bytes of a UTF-8 character, advancing
the pointer. */
#define GETUTF8INC(c, eptr) \
{ \
int gcii; \
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
int gcss = 6*gcaa; \
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
for (gcii = 1; gcii <= gcaa; gcii++) \
if ((c & 0x20) == 0) \
c = ((c & 0x1f) << 6) | (*eptr++ & 0x3f); \
else if ((c & 0x10) == 0) \
{ \
gcss -= 6; \
c |= (eptr[gcii] & 0x3f) << gcss; \
c = ((c & 0x0f) << 12) | ((*eptr & 0x3f) << 6) | (eptr[1] & 0x3f); \
eptr += 2; \
} \
else if ((c & 0x08) == 0) \
{ \
c = ((c & 0x07) << 18) | ((*eptr & 0x3f) << 12) | \
((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \
eptr += 3; \
} \
else if ((c & 0x04) == 0) \
{ \
c = ((c & 0x03) << 24) | ((*eptr & 0x3f) << 18) | \
((eptr[1] & 0x3f) << 12) | ((eptr[2] & 0x3f) << 6) | \
(eptr[3] & 0x3f); \
eptr += 4; \
} \
else \
{ \
c = ((c & 0x01) << 30) | ((*eptr & 0x3f) << 24) | \
((eptr[1] & 0x3f) << 18) | ((eptr[2] & 0x3f) << 12) | \
((eptr[3] & 0x3f) << 6) | (eptr[4] & 0x3f); \
eptr += 5; \
} \
}
@ -463,31 +504,49 @@ know we are in UTF-8 mode. */
#define GETCHARINC(c, eptr) \
c = *eptr++; \
if (c >= 0xc0) \
{ \
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
int gcss = 6*gcaa; \
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
while (gcaa-- > 0) \
{ \
gcss -= 6; \
c |= (*eptr++ & 0x3f) << gcss; \
} \
}
if (c >= 0xc0) GETUTF8INC(c, eptr);
/* Get the next character, testing for UTF-8 mode, and advancing the pointer */
/* Get the next character, testing for UTF-8 mode, and advancing the pointer.
This is called when we don't know if we are in UTF-8 mode. */
#define GETCHARINCTEST(c, eptr) \
c = *eptr++; \
if (utf8 && c >= 0xc0) \
if (utf8 && c >= 0xc0) GETUTF8INC(c, eptr);
/* Base macro to pick up the remaining bytes of a UTF-8 character, not
advancing the pointer, incrementing the length. */
#define GETUTF8LEN(c, eptr, len) \
{ \
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
int gcss = 6*gcaa; \
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
while (gcaa-- > 0) \
if ((c & 0x20) == 0) \
{ \
gcss -= 6; \
c |= (*eptr++ & 0x3f) << gcss; \
c = ((c & 0x1f) << 6) | (eptr[1] & 0x3f); \
len++; \
} \
else if ((c & 0x10) == 0) \
{ \
c = ((c & 0x0f) << 12) | ((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \
len += 2; \
} \
else if ((c & 0x08) == 0) \
{\
c = ((c & 0x07) << 18) | ((eptr[1] & 0x3f) << 12) | \
((eptr[2] & 0x3f) << 6) | (eptr[3] & 0x3f); \
len += 3; \
} \
else if ((c & 0x04) == 0) \
{ \
c = ((c & 0x03) << 24) | ((eptr[1] & 0x3f) << 18) | \
((eptr[2] & 0x3f) << 12) | ((eptr[3] & 0x3f) << 6) | \
(eptr[4] & 0x3f); \
len += 4; \
} \
else \
{\
c = ((c & 0x01) << 30) | ((eptr[1] & 0x3f) << 24) | \
((eptr[2] & 0x3f) << 18) | ((eptr[3] & 0x3f) << 12) | \
((eptr[4] & 0x3f) << 6) | (eptr[5] & 0x3f); \
len += 5; \
} \
}
@ -496,39 +555,15 @@ if there are extra bytes. This is called when we know we are in UTF-8 mode. */
#define GETCHARLEN(c, eptr, len) \
c = *eptr; \
if (c >= 0xc0) \
{ \
int gcii; \
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
int gcss = 6*gcaa; \
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
for (gcii = 1; gcii <= gcaa; gcii++) \
{ \
gcss -= 6; \
c |= (eptr[gcii] & 0x3f) << gcss; \
} \
len += gcaa; \
}
if (c >= 0xc0) GETUTF8LEN(c, eptr, len);
/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
pointer, incrementing length if there are extra bytes. This is called when we
know we are in UTF-8 mode. */
do not know if we are in UTF-8 mode. */
#define GETCHARLENTEST(c, eptr, len) \
c = *eptr; \
if (utf8 && c >= 0xc0) \
{ \
int gcii; \
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
int gcss = 6*gcaa; \
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
for (gcii = 1; gcii <= gcaa; gcii++) \
{ \
gcss -= 6; \
c |= (eptr[gcii] & 0x3f) << gcss; \
} \
len += gcaa; \
}
if (utf8 && c >= 0xc0) GETUTF8LEN(c, eptr, len);
/* If the pointer is not at the start of a character, move it back until
it is. This is called only in UTF-8 mode - we don't put a test within the macro
@ -536,7 +571,7 @@ because almost all calls are already within a block of UTF-8 only code. */
/* BACKCHAR steps eptr backwards while it points at a UTF-8 continuation byte
(top two bits 10). eptr must be a modifiable pointer lvalue; it is
parenthesized so that an lvalue expression such as *pp moves the pointer that
*pp designates rather than pp itself. No lower bound is checked here. */

#define BACKCHAR(eptr) while((*(eptr) & 0xc0) == 0x80) (eptr)--
#endif
#endif /* SUPPORT_UTF8 */
/* In case there is no definition of offsetof() provided - though any proper
@ -580,7 +615,7 @@ time, run time, or study time, respectively. */
PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \
PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \
PCRE_JAVASCRIPT_COMPAT)
PCRE_JAVASCRIPT_COMPAT|PCRE_UCP|PCRE_NO_START_OPTIMIZE)
#define PUBLIC_EXEC_OPTIONS \
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \
@ -620,7 +655,7 @@ variable-length repeat, or anything other than literal characters. */
environments where these macros are defined elsewhere. Unfortunately, there
is no way to do the same for the typedef. */
typedef gboolean BOOL;
typedef gboolean BOOL;
/* If PCRE is to support UTF-8 on EBCDIC platforms, we cannot use normal
character constants like '*' because the compiler would emit their EBCDIC code,
@ -870,6 +905,7 @@ so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */
#define STRING_COMMIT0 "COMMIT\0"
#define STRING_F0 "F\0"
#define STRING_FAIL0 "FAIL\0"
#define STRING_MARK0 "MARK\0"
#define STRING_PRUNE0 "PRUNE\0"
#define STRING_SKIP0 "SKIP\0"
#define STRING_THEN "THEN"
@ -891,14 +927,16 @@ so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */
#define STRING_DEFINE "DEFINE"
#define STRING_CR_RIGHTPAR "CR)"
#define STRING_LF_RIGHTPAR "LF)"
#define STRING_CRLF_RIGHTPAR "CRLF)"
#define STRING_ANY_RIGHTPAR "ANY)"
#define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)"
#define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)"
#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)"
#define STRING_UTF8_RIGHTPAR "UTF8)"
#define STRING_CR_RIGHTPAR "CR)"
#define STRING_LF_RIGHTPAR "LF)"
#define STRING_CRLF_RIGHTPAR "CRLF)"
#define STRING_ANY_RIGHTPAR "ANY)"
#define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)"
#define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)"
#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)"
#define STRING_UTF8_RIGHTPAR "UTF8)"
#define STRING_UCP_RIGHTPAR "UCP)"
#define STRING_NO_START_OPT_RIGHTPAR "NO_START_OPT)"
#else /* SUPPORT_UTF8 */
@ -1122,6 +1160,7 @@ only. */
#define STRING_COMMIT0 STR_C STR_O STR_M STR_M STR_I STR_T "\0"
#define STRING_F0 STR_F "\0"
#define STRING_FAIL0 STR_F STR_A STR_I STR_L "\0"
#define STRING_MARK0 STR_M STR_A STR_R STR_K "\0"
#define STRING_PRUNE0 STR_P STR_R STR_U STR_N STR_E "\0"
#define STRING_SKIP0 STR_S STR_K STR_I STR_P "\0"
#define STRING_THEN STR_T STR_H STR_E STR_N
@ -1143,14 +1182,16 @@ only. */
#define STRING_DEFINE STR_D STR_E STR_F STR_I STR_N STR_E
#define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS
#define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS
#define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
#define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS
#define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS
#define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS
#define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS
#define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS
#define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
#define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS
#define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS
#define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS
#define STRING_UCP_RIGHTPAR STR_U STR_C STR_P STR_RIGHT_PARENTHESIS
#define STRING_NO_START_OPT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_S STR_T STR_A STR_R STR_T STR_UNDERSCORE STR_O STR_P STR_T STR_RIGHT_PARENTHESIS
#endif /* SUPPORT_UTF8 */
@ -1183,9 +1224,13 @@ only. */
#define PT_ANY 0 /* Any property - matches all chars */
#define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */
#define PT_GC 2 /* General characteristic (e.g. L) */
#define PT_PC 3 /* Particular characteristic (e.g. Lu) */
#define PT_GC 2 /* Specified general characteristic (e.g. L) */
#define PT_PC 3 /* Specified particular characteristic (e.g. Lu) */
#define PT_SC 4 /* Script (e.g. Han) */
#define PT_ALNUM 5 /* Alphanumeric - the union of L and N */
#define PT_SPACE 6 /* Perl space - Z plus 9,10,12,13 */
#define PT_PXSPACE 7 /* POSIX space - Z plus 9,10,11,12,13 */
#define PT_WORD 8 /* Word - L plus N plus underscore */
/* Flag bits and data types for the extended class (OP_XCLASS) for classes that
contain UTF-8 characters with values greater than 255. */
@ -1202,9 +1247,15 @@ contain UTF-8 characters with values greater than 255. */
/* These are escaped items that aren't just an encoding of a particular data
value such as \n. They must have non-zero values, as check_escape() returns
their negation. Also, they must appear in the same order as in the opcode
definitions below, up to ESC_z. There's a dummy for OP_ANY because it
corresponds to "." rather than an escape sequence, and another for OP_ALLANY
(which is used for [^] in JavaScript compatibility mode).
definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it
corresponds to "." in DOTALL mode rather than an escape sequence. It is also
used for [^] in JavaScript compatibility mode. In non-DOTALL mode, "." behaves
like \N.
The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc.
when PCRE_UCP is set, when replacement of \d etc by \p sequences is required.
They must be contiguous, and remain in order so that the replacements can be
looked up from a table.
The final escape must be ESC_REF as subsequent values are used for
backreferences (\1, \2, \3, etc). There are two tests in the code for an escape
@ -1214,11 +1265,12 @@ put in between that don't consume a character, that code will have to change.
*/
enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s,
ESC_W, ESC_w, ESC_dum1, ESC_dum2, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H,
ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_g, ESC_k,
ESC_W, ESC_w, ESC_N, ESC_dum, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H,
ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z,
ESC_E, ESC_Q, ESC_g, ESC_k,
ESC_DU, ESC_du, ESC_SU, ESC_su, ESC_WU, ESC_wu,
ESC_REF };
/* Opcode table: Starting from 1 (i.e. after OP_END), the values up to
OP_EOD must correspond in order to the list of escapes immediately above.
@ -1242,8 +1294,8 @@ enum {
OP_WHITESPACE, /* 9 \s */
OP_NOT_WORDCHAR, /* 10 \W */
OP_WORDCHAR, /* 11 \w */
OP_ANY, /* 12 Match any character (subject to DOTALL) */
OP_ALLANY, /* 13 Match any character (not subject to DOTALL) */
OP_ANY, /* 12 Match any character except newline */
OP_ALLANY, /* 13 Match any character */
OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */
OP_NOTPROP, /* 15 \P (not Unicode property) */
OP_PROP, /* 16 \p (Unicode property) */
@ -1373,20 +1425,24 @@ enum {
/* These are backtracking control verbs */
OP_PRUNE, /* 107 */
OP_SKIP, /* 108 */
OP_THEN, /* 109 */
OP_COMMIT, /* 110 */
OP_MARK, /* 107 always has an argument */
OP_PRUNE, /* 108 */
OP_PRUNE_ARG, /* 109 same, but with argument */
OP_SKIP, /* 110 */
OP_SKIP_ARG, /* 111 same, but with argument */
OP_THEN, /* 112 */
OP_THEN_ARG, /* 113 same, but with argument */
OP_COMMIT, /* 114 */
/* These are forced failure and success verbs */
OP_FAIL, /* 111 */
OP_ACCEPT, /* 112 */
OP_CLOSE, /* 113 Used before OP_ACCEPT to close open captures */
OP_FAIL, /* 115 */
OP_ACCEPT, /* 116 */
OP_CLOSE, /* 117 Used before OP_ACCEPT to close open captures */
/* This is used to skip a subpattern with a {0} quantifier */
OP_SKIPZERO, /* 114 */
OP_SKIPZERO, /* 118 */
/* This is not an opcode, but is used to check that tables indexed by opcode
are the correct length, in order to catch updating errors - there have been
@ -1397,7 +1453,7 @@ enum {
/* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro
definitions that follow must also be updated to match. There are also tables
called "coptable" cna "poptable" in pcre_dfa_exec.c that must be updated. */
called "coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */
/* This macro defines textual names for all the opcodes. These are used only
@ -1422,7 +1478,8 @@ for debugging. The macro is referenced only in pcre_printint.c. */
"Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", \
"Cond ref", "Cond nref", "Cond rec", "Cond nrec", "Cond def", \
"Brazero", "Braminzero", \
"*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \
"*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", \
"*THEN", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \
"Close", "Skip zero"
@ -1488,8 +1545,10 @@ in UTF-8 mode. The code that uses this table must know about such things. */
3, 3, /* RREF, NRREF */ \
1, /* DEF */ \
1, 1, /* BRAZERO, BRAMINZERO */ \
1, 1, 1, 1, /* PRUNE, SKIP, THEN, COMMIT, */ \
1, 1, 3, 1 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
3, 1, 3, /* MARK, PRUNE, PRUNE_ARG */ \
1, 3, /* SKIP, SKIP_ARG */ \
1+LINK_SIZE, 3+LINK_SIZE, /* THEN, THEN_ARG */ \
1, 1, 1, 3, 1 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
/* A magic value for OP_RREF and OP_NRREF to indicate the "any recursion"
@ -1507,7 +1566,8 @@ enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERRCOUNT };
ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68,
ERRCOUNT };
/* The real format of the start of the pcre block; the index of names and the
code vector run on as long as necessary after the end. We store an explicit
@ -1650,6 +1710,7 @@ typedef struct match_data {
BOOL noteol; /* NOTEOL flag */
BOOL utf8; /* UTF8 flag */
BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */
BOOL use_ucp; /* PCRE_UCP flag */
BOOL endonly; /* Dollar not before final \n */
BOOL notempty; /* Empty string match not wanted */
BOOL notempty_atstart; /* Empty string match at start not wanted */
@ -1669,6 +1730,7 @@ typedef struct match_data {
int eptrn; /* Next free eptrblock */
recursion_info *recursive; /* Linked list of recursion data */
void *callout_data; /* To pass back to callouts */
const uschar *mark; /* Mark pointer to pass back */
} match_data;
/* A similar structure is used for the same purpose by the DFA matching
@ -1764,7 +1826,7 @@ extern BOOL _pcre_is_newline(USPTR, int, USPTR, int *, BOOL);
extern int _pcre_ord2utf8(int, uschar *);
extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *,
const pcre_study_data *, pcre_study_data *);
#define _pcre_valid_utf8(u, i) TRUE
#define _pcre_valid_utf8(USPTR, int) TRUE
extern BOOL _pcre_was_newline(USPTR, int, USPTR, int *, BOOL);
extern BOOL _pcre_xclass(int, const uschar *);

View File

@ -48,6 +48,7 @@ supporting functions. */
#include "pcre_internal.h"
/* Set the bit for byte value c in the bit map named start_bits, which must be
in scope where the macro is used. The argument and the whole expansion are
parenthesized so that expression arguments work; c is evaluated twice, so it
must not have side effects. */
#define SET_BIT(c) (start_bits[(c)/8] |= (1 << ((c)&7)))
/* Returns from set_start_bits() */
@ -413,6 +414,18 @@ for (;;)
#endif
break;
/* Skip these, but we need to add in the name length. */
case OP_MARK:
case OP_PRUNE_ARG:
case OP_SKIP_ARG:
cc += _pcre_OP_lengths[op] + cc[1];
break;
case OP_THEN_ARG:
cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
break;
/* For the record, these are the opcodes that are matched by "default":
OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
OP_THEN. */
@ -431,25 +444,121 @@ for (;;)
* Set a bit and maybe its alternate case *
*************************************************/
/* Given a character, set its bit in the table, and also the bit for the other
version of a letter if we are caseless.
/* Given a character, set its first byte's bit in the table, and also the
corresponding bit for the other version of a letter if we are caseless. In
UTF-8 mode, for characters greater than 127, we can only do the caseless thing
when Unicode property support is available.
Arguments:
start_bits points to the bit map
c is the character
p points to the character
caseless the caseless flag
cd the block with char table pointers
utf8 TRUE for UTF-8 mode
Returns: nothing
Returns: pointer after the character
*/
static const uschar *
set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
  compile_data *cd, BOOL utf8)
{
unsigned int ch = *p;

/* The first byte of the character is always a possible starting byte. */

SET_BIT(ch);

#ifdef SUPPORT_UTF8
/* In UTF-8 mode a first byte above 127 belongs to a longer character. Fetch
the whole character with GETCHARINC, which is expected to move p past it, and
finish in this branch. */

if (utf8 && ch > 127)
  {
  GETCHARINC(ch, p);
#ifdef SUPPORT_UCP
  /* The other case is only available with Unicode property support; record
  the first byte of its UTF-8 encoding as well. */

  if (caseless)
    {
    uschar other[8];
    ch = UCD_OTHERCASE(ch);
    (void)_pcre_ord2utf8(ch, other);
    SET_BIT(other[0]);
    }
#endif
  return p;
  }
#endif

/* Not UTF-8 mode, or the character is less than 128. When matching caselessly,
a letter also gets the bit for its other case, taken from cd->fcc. */

if (!caseless) return p + 1;
if ((cd->ctypes[ch] & ctype_letter) != 0) SET_BIT(cd->fcc[ch]);
return p + 1;
}
/*************************************************
* Set bits for a positive character type *
*************************************************/
/* This function sets starting bits for a character type. In UTF-8 mode, we can
only do a direct setting for bytes less than 128, as otherwise there can be
confusion with bytes in the middle of UTF-8 characters. In a "traditional"
environment, the tables will only recognize ASCII characters anyway, but in at
least one Windows environment, some higher bytes bits were set in the tables.
So we deal with that case by considering the UTF-8 encoding.
Arguments:
start_bits the starting bitmap
cbit_type the type of character wanted
table_limit 32 for non-UTF-8; 16 for UTF-8
cd the block with char table pointers
Returns: nothing
*/
static void
set_table_bit(uschar *start_bits, unsigned int c, BOOL caseless,
set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
compile_data *cd)
{
start_bits[c/8] |= (1 << (c&7));
if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
register int c;
for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
if (table_limit == 32) return;
for (c = 128; c < 256; c++)
{
if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
{
uschar buff[8];
(void)_pcre_ord2utf8(c, buff);
SET_BIT(buff[0]);
}
}
}
/*************************************************
* Set bits for a negative character type *
*************************************************/
/* This function sets starting bits for a negative character type such as \D.
In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
otherwise there can be confusion with bytes in the middle of UTF-8 characters.
Unlike in the positive case, where we can set appropriate starting bits for
specific high-valued UTF-8 characters, in this case we have to set the bits for
all high-valued characters. The lowest is 0xc2, but we overkill by starting at
0xc0 (192) for simplicity.
Arguments:
start_bits the starting bitmap
cbit_type the type of character wanted
table_limit 32 for non-UTF-8; 16 for UTF-8
cd the block with char table pointers
Returns: nothing
*/
static void
set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
  compile_data *cd)
{
register int i = 0;

/* Merge in the complement of the class bit map for the first table_limit
bytes of the map. */

while (i < table_limit)
  {
  start_bits[i] |= ~cd->cbits[cbit_type + i];
  i++;
  }

/* With a limit of 32 there is nothing further to set. */

if (table_limit == 32) return;

/* Otherwise set every bit in map bytes 24 to 31, that is, for all byte values
from 0xc0 upwards. */

for (i = 24; i < 32; i++) start_bits[i] = 0xff;
}
@ -484,6 +593,7 @@ set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
{
register int c;
int yield = SSB_DONE;
int table_limit = utf8? 16:32;
#if 0
/* ========================================================================= */
@ -607,12 +717,7 @@ do
case OP_QUERY:
case OP_MINQUERY:
case OP_POSQUERY:
set_table_bit(start_bits, tcode[1], caseless, cd);
tcode += 2;
#ifdef SUPPORT_UTF8
if (utf8 && tcode[-1] >= 0xc0)
tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
#endif
tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
break;
/* Single-char upto sets the bit and tries the next */
@ -620,12 +725,7 @@ do
case OP_UPTO:
case OP_MINUPTO:
case OP_POSUPTO:
set_table_bit(start_bits, tcode[3], caseless, cd);
tcode += 4;
#ifdef SUPPORT_UTF8
if (utf8 && tcode[-1] >= 0xc0)
tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
#endif
tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
break;
/* At least one single char sets the bit and stops */
@ -638,59 +738,86 @@ do
case OP_PLUS:
case OP_MINPLUS:
case OP_POSPLUS:
set_table_bit(start_bits, tcode[1], caseless, cd);
(void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
try_next = FALSE;
break;
/* Single character type sets the bits and stops */
/* Special spacing and line-terminating items. These recognize specific
lists of characters. The difference between VSPACE and ANYNL is that the
latter can match the two-character CRLF sequence, but that is not
relevant for finding the first character, so their code here is
identical. */
case OP_HSPACE:
SET_BIT(0x09);
SET_BIT(0x20);
if (utf8)
{
SET_BIT(0xC2); /* For U+00A0 */
SET_BIT(0xE1); /* For U+1680, U+180E */
SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
SET_BIT(0xE3); /* For U+3000 */
}
else SET_BIT(0xA0);
try_next = FALSE;
break;
case OP_ANYNL:
case OP_VSPACE:
SET_BIT(0x0A);
SET_BIT(0x0B);
SET_BIT(0x0C);
SET_BIT(0x0D);
if (utf8)
{
SET_BIT(0xC2); /* For U+0085 */
SET_BIT(0xE2); /* For U+2028, U+2029 */
}
else SET_BIT(0x85);
try_next = FALSE;
break;
/* Single character types set the bits and stop. Note that if PCRE_UCP
is set, we do not see these op codes because \d etc are converted to
properties. Therefore, these apply in the case when only characters less
than 256 are recognized to match the types. */
case OP_NOT_DIGIT:
for (c = 0; c < 32; c++)
start_bits[c] |= ~cd->cbits[c+cbit_digit];
set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
try_next = FALSE;
break;
case OP_DIGIT:
for (c = 0; c < 32; c++)
start_bits[c] |= cd->cbits[c+cbit_digit];
set_type_bits(start_bits, cbit_digit, table_limit, cd);
try_next = FALSE;
break;
/* The cbit_space table has vertical tab as whitespace; we have to
discard it. */
ensure it is set as not whitespace. */
case OP_NOT_WHITESPACE:
for (c = 0; c < 32; c++)
{
int d = cd->cbits[c+cbit_space];
if (c == 1) d &= ~0x08;
start_bits[c] |= ~d;
}
set_nottype_bits(start_bits, cbit_space, table_limit, cd);
start_bits[1] |= 0x08;
try_next = FALSE;
break;
/* The cbit_space table has vertical tab as whitespace; we have to
discard it. */
not set it from the table. */
case OP_WHITESPACE:
for (c = 0; c < 32; c++)
{
int d = cd->cbits[c+cbit_space];
if (c == 1) d &= ~0x08;
start_bits[c] |= d;
}
c = start_bits[1]; /* Save in case it was already set */
set_type_bits(start_bits, cbit_space, table_limit, cd);
start_bits[1] = (start_bits[1] & ~0x08) | c;
try_next = FALSE;
break;
case OP_NOT_WORDCHAR:
for (c = 0; c < 32; c++)
start_bits[c] |= ~cd->cbits[c+cbit_word];
set_nottype_bits(start_bits, cbit_word, table_limit, cd);
try_next = FALSE;
break;
case OP_WORDCHAR:
for (c = 0; c < 32; c++)
start_bits[c] |= cd->cbits[c+cbit_word];
set_type_bits(start_bits, cbit_word, table_limit, cd);
try_next = FALSE;
break;
@ -699,6 +826,7 @@ do
case OP_TYPEPLUS:
case OP_TYPEMINPLUS:
case OP_TYPEPOSPLUS:
tcode++;
break;
@ -722,52 +850,69 @@ do
case OP_TYPEPOSQUERY:
switch(tcode[1])
{
default:
case OP_ANY:
case OP_ALLANY:
return SSB_FAIL;
case OP_HSPACE:
SET_BIT(0x09);
SET_BIT(0x20);
if (utf8)
{
SET_BIT(0xC2); /* For U+00A0 */
SET_BIT(0xE1); /* For U+1680, U+180E */
SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
SET_BIT(0xE3); /* For U+3000 */
}
else SET_BIT(0xA0);
break;
case OP_ANYNL:
case OP_VSPACE:
SET_BIT(0x0A);
SET_BIT(0x0B);
SET_BIT(0x0C);
SET_BIT(0x0D);
if (utf8)
{
SET_BIT(0xC2); /* For U+0085 */
SET_BIT(0xE2); /* For U+2028, U+2029 */
}
else SET_BIT(0x85);
break;
case OP_NOT_DIGIT:
for (c = 0; c < 32; c++)
start_bits[c] |= ~cd->cbits[c+cbit_digit];
set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
break;
case OP_DIGIT:
for (c = 0; c < 32; c++)
start_bits[c] |= cd->cbits[c+cbit_digit];
set_type_bits(start_bits, cbit_digit, table_limit, cd);
break;
/* The cbit_space table has vertical tab as whitespace; we have to
discard it. */
ensure it gets set as not whitespace. */
case OP_NOT_WHITESPACE:
for (c = 0; c < 32; c++)
{
int d = cd->cbits[c+cbit_space];
if (c == 1) d &= ~0x08;
start_bits[c] |= ~d;
}
set_nottype_bits(start_bits, cbit_space, table_limit, cd);
start_bits[1] |= 0x08;
break;
/* The cbit_space table has vertical tab as whitespace; we have to
discard it. */
avoid setting it. */
case OP_WHITESPACE:
for (c = 0; c < 32; c++)
{
int d = cd->cbits[c+cbit_space];
if (c == 1) d &= ~0x08;
start_bits[c] |= d;
}
c = start_bits[1]; /* Save in case it was already set */
set_type_bits(start_bits, cbit_space, table_limit, cd);
start_bits[1] = (start_bits[1] & ~0x08) | c;
break;
case OP_NOT_WORDCHAR:
for (c = 0; c < 32; c++)
start_bits[c] |= ~cd->cbits[c+cbit_word];
set_nottype_bits(start_bits, cbit_word, table_limit, cd);
break;
case OP_WORDCHAR:
for (c = 0; c < 32; c++)
start_bits[c] |= cd->cbits[c+cbit_word];
set_type_bits(start_bits, cbit_word, table_limit, cd);
break;
}

View File

@ -123,8 +123,10 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Avestan0 STR_A STR_v STR_e STR_s STR_t STR_a STR_n "\0"
#define STRING_Balinese0 STR_B STR_a STR_l STR_i STR_n STR_e STR_s STR_e "\0"
#define STRING_Bamum0 STR_B STR_a STR_m STR_u STR_m "\0"
#define STRING_Batak0 STR_B STR_a STR_t STR_a STR_k "\0"
#define STRING_Bengali0 STR_B STR_e STR_n STR_g STR_a STR_l STR_i "\0"
#define STRING_Bopomofo0 STR_B STR_o STR_p STR_o STR_m STR_o STR_f STR_o "\0"
#define STRING_Brahmi0 STR_B STR_r STR_a STR_h STR_m STR_i "\0"
#define STRING_Braille0 STR_B STR_r STR_a STR_i STR_l STR_l STR_e "\0"
#define STRING_Buginese0 STR_B STR_u STR_g STR_i STR_n STR_e STR_s STR_e "\0"
#define STRING_Buhid0 STR_B STR_u STR_h STR_i STR_d "\0"
@ -184,6 +186,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Lu0 STR_L STR_u "\0"
#define STRING_Lycian0 STR_L STR_y STR_c STR_i STR_a STR_n "\0"
#define STRING_Lydian0 STR_L STR_y STR_d STR_i STR_a STR_n "\0"
#define STRING_Mandaic0 STR_M STR_a STR_n STR_d STR_a STR_i STR_c "\0"
#define STRING_M0 STR_M "\0"
#define STRING_Malayalam0 STR_M STR_a STR_l STR_a STR_y STR_a STR_l STR_a STR_m "\0"
#define STRING_Mc0 STR_M STR_c "\0"
@ -243,6 +246,10 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
#define STRING_Tifinagh0 STR_T STR_i STR_f STR_i STR_n STR_a STR_g STR_h "\0"
#define STRING_Ugaritic0 STR_U STR_g STR_a STR_r STR_i STR_t STR_i STR_c "\0"
#define STRING_Vai0 STR_V STR_a STR_i "\0"
#define STRING_Xan0 STR_X STR_a STR_n "\0"
#define STRING_Xps0 STR_X STR_p STR_s "\0"
#define STRING_Xsp0 STR_X STR_s STR_p "\0"
#define STRING_Xwd0 STR_X STR_w STR_d "\0"
#define STRING_Yi0 STR_Y STR_i "\0"
#define STRING_Z0 STR_Z "\0"
#define STRING_Zl0 STR_Z STR_l "\0"
@ -256,8 +263,10 @@ const char _pcre_utt_names[] =
STRING_Avestan0
STRING_Balinese0
STRING_Bamum0
STRING_Batak0
STRING_Bengali0
STRING_Bopomofo0
STRING_Brahmi0
STRING_Braille0
STRING_Buginese0
STRING_Buhid0
@ -319,6 +328,7 @@ const char _pcre_utt_names[] =
STRING_Lydian0
STRING_M0
STRING_Malayalam0
STRING_Mandaic0
STRING_Mc0
STRING_Me0
STRING_Meetei_Mayek0
@ -376,6 +386,10 @@ const char _pcre_utt_names[] =
STRING_Tifinagh0
STRING_Ugaritic0
STRING_Vai0
STRING_Xan0
STRING_Xps0
STRING_Xsp0
STRING_Xwd0
STRING_Yi0
STRING_Z0
STRING_Zl0
@ -389,131 +403,138 @@ const ucp_type_table _pcre_utt[] = {
{ 20, PT_SC, ucp_Avestan },
{ 28, PT_SC, ucp_Balinese },
{ 37, PT_SC, ucp_Bamum },
{ 43, PT_SC, ucp_Bengali },
{ 51, PT_SC, ucp_Bopomofo },
{ 60, PT_SC, ucp_Braille },
{ 68, PT_SC, ucp_Buginese },
{ 77, PT_SC, ucp_Buhid },
{ 83, PT_GC, ucp_C },
{ 85, PT_SC, ucp_Canadian_Aboriginal },
{ 105, PT_SC, ucp_Carian },
{ 112, PT_PC, ucp_Cc },
{ 115, PT_PC, ucp_Cf },
{ 118, PT_SC, ucp_Cham },
{ 123, PT_SC, ucp_Cherokee },
{ 132, PT_PC, ucp_Cn },
{ 135, PT_PC, ucp_Co },
{ 138, PT_SC, ucp_Common },
{ 145, PT_SC, ucp_Coptic },
{ 152, PT_PC, ucp_Cs },
{ 155, PT_SC, ucp_Cuneiform },
{ 165, PT_SC, ucp_Cypriot },
{ 173, PT_SC, ucp_Cyrillic },
{ 182, PT_SC, ucp_Deseret },
{ 190, PT_SC, ucp_Devanagari },
{ 201, PT_SC, ucp_Egyptian_Hieroglyphs },
{ 222, PT_SC, ucp_Ethiopic },
{ 231, PT_SC, ucp_Georgian },
{ 240, PT_SC, ucp_Glagolitic },
{ 251, PT_SC, ucp_Gothic },
{ 258, PT_SC, ucp_Greek },
{ 264, PT_SC, ucp_Gujarati },
{ 273, PT_SC, ucp_Gurmukhi },
{ 282, PT_SC, ucp_Han },
{ 286, PT_SC, ucp_Hangul },
{ 293, PT_SC, ucp_Hanunoo },
{ 301, PT_SC, ucp_Hebrew },
{ 308, PT_SC, ucp_Hiragana },
{ 317, PT_SC, ucp_Imperial_Aramaic },
{ 334, PT_SC, ucp_Inherited },
{ 344, PT_SC, ucp_Inscriptional_Pahlavi },
{ 366, PT_SC, ucp_Inscriptional_Parthian },
{ 389, PT_SC, ucp_Javanese },
{ 398, PT_SC, ucp_Kaithi },
{ 405, PT_SC, ucp_Kannada },
{ 413, PT_SC, ucp_Katakana },
{ 422, PT_SC, ucp_Kayah_Li },
{ 431, PT_SC, ucp_Kharoshthi },
{ 442, PT_SC, ucp_Khmer },
{ 448, PT_GC, ucp_L },
{ 450, PT_LAMP, 0 },
{ 453, PT_SC, ucp_Lao },
{ 457, PT_SC, ucp_Latin },
{ 463, PT_SC, ucp_Lepcha },
{ 470, PT_SC, ucp_Limbu },
{ 476, PT_SC, ucp_Linear_B },
{ 485, PT_SC, ucp_Lisu },
{ 490, PT_PC, ucp_Ll },
{ 493, PT_PC, ucp_Lm },
{ 496, PT_PC, ucp_Lo },
{ 499, PT_PC, ucp_Lt },
{ 502, PT_PC, ucp_Lu },
{ 505, PT_SC, ucp_Lycian },
{ 512, PT_SC, ucp_Lydian },
{ 519, PT_GC, ucp_M },
{ 521, PT_SC, ucp_Malayalam },
{ 531, PT_PC, ucp_Mc },
{ 534, PT_PC, ucp_Me },
{ 537, PT_SC, ucp_Meetei_Mayek },
{ 550, PT_PC, ucp_Mn },
{ 553, PT_SC, ucp_Mongolian },
{ 563, PT_SC, ucp_Myanmar },
{ 571, PT_GC, ucp_N },
{ 573, PT_PC, ucp_Nd },
{ 576, PT_SC, ucp_New_Tai_Lue },
{ 588, PT_SC, ucp_Nko },
{ 592, PT_PC, ucp_Nl },
{ 595, PT_PC, ucp_No },
{ 598, PT_SC, ucp_Ogham },
{ 604, PT_SC, ucp_Ol_Chiki },
{ 613, PT_SC, ucp_Old_Italic },
{ 624, PT_SC, ucp_Old_Persian },
{ 636, PT_SC, ucp_Old_South_Arabian },
{ 654, PT_SC, ucp_Old_Turkic },
{ 665, PT_SC, ucp_Oriya },
{ 671, PT_SC, ucp_Osmanya },
{ 679, PT_GC, ucp_P },
{ 681, PT_PC, ucp_Pc },
{ 684, PT_PC, ucp_Pd },
{ 687, PT_PC, ucp_Pe },
{ 690, PT_PC, ucp_Pf },
{ 693, PT_SC, ucp_Phags_Pa },
{ 702, PT_SC, ucp_Phoenician },
{ 713, PT_PC, ucp_Pi },
{ 716, PT_PC, ucp_Po },
{ 719, PT_PC, ucp_Ps },
{ 722, PT_SC, ucp_Rejang },
{ 729, PT_SC, ucp_Runic },
{ 735, PT_GC, ucp_S },
{ 737, PT_SC, ucp_Samaritan },
{ 747, PT_SC, ucp_Saurashtra },
{ 758, PT_PC, ucp_Sc },
{ 761, PT_SC, ucp_Shavian },
{ 769, PT_SC, ucp_Sinhala },
{ 777, PT_PC, ucp_Sk },
{ 780, PT_PC, ucp_Sm },
{ 783, PT_PC, ucp_So },
{ 786, PT_SC, ucp_Sundanese },
{ 796, PT_SC, ucp_Syloti_Nagri },
{ 809, PT_SC, ucp_Syriac },
{ 816, PT_SC, ucp_Tagalog },
{ 824, PT_SC, ucp_Tagbanwa },
{ 833, PT_SC, ucp_Tai_Le },
{ 840, PT_SC, ucp_Tai_Tham },
{ 849, PT_SC, ucp_Tai_Viet },
{ 858, PT_SC, ucp_Tamil },
{ 864, PT_SC, ucp_Telugu },
{ 871, PT_SC, ucp_Thaana },
{ 878, PT_SC, ucp_Thai },
{ 883, PT_SC, ucp_Tibetan },
{ 891, PT_SC, ucp_Tifinagh },
{ 900, PT_SC, ucp_Ugaritic },
{ 909, PT_SC, ucp_Vai },
{ 913, PT_SC, ucp_Yi },
{ 916, PT_GC, ucp_Z },
{ 918, PT_PC, ucp_Zl },
{ 921, PT_PC, ucp_Zp },
{ 924, PT_PC, ucp_Zs }
{ 43, PT_SC, ucp_Batak },
{ 49, PT_SC, ucp_Bengali },
{ 57, PT_SC, ucp_Bopomofo },
{ 66, PT_SC, ucp_Brahmi },
{ 73, PT_SC, ucp_Braille },
{ 81, PT_SC, ucp_Buginese },
{ 90, PT_SC, ucp_Buhid },
{ 96, PT_GC, ucp_C },
{ 98, PT_SC, ucp_Canadian_Aboriginal },
{ 118, PT_SC, ucp_Carian },
{ 125, PT_PC, ucp_Cc },
{ 128, PT_PC, ucp_Cf },
{ 131, PT_SC, ucp_Cham },
{ 136, PT_SC, ucp_Cherokee },
{ 145, PT_PC, ucp_Cn },
{ 148, PT_PC, ucp_Co },
{ 151, PT_SC, ucp_Common },
{ 158, PT_SC, ucp_Coptic },
{ 165, PT_PC, ucp_Cs },
{ 168, PT_SC, ucp_Cuneiform },
{ 178, PT_SC, ucp_Cypriot },
{ 186, PT_SC, ucp_Cyrillic },
{ 195, PT_SC, ucp_Deseret },
{ 203, PT_SC, ucp_Devanagari },
{ 214, PT_SC, ucp_Egyptian_Hieroglyphs },
{ 235, PT_SC, ucp_Ethiopic },
{ 244, PT_SC, ucp_Georgian },
{ 253, PT_SC, ucp_Glagolitic },
{ 264, PT_SC, ucp_Gothic },
{ 271, PT_SC, ucp_Greek },
{ 277, PT_SC, ucp_Gujarati },
{ 286, PT_SC, ucp_Gurmukhi },
{ 295, PT_SC, ucp_Han },
{ 299, PT_SC, ucp_Hangul },
{ 306, PT_SC, ucp_Hanunoo },
{ 314, PT_SC, ucp_Hebrew },
{ 321, PT_SC, ucp_Hiragana },
{ 330, PT_SC, ucp_Imperial_Aramaic },
{ 347, PT_SC, ucp_Inherited },
{ 357, PT_SC, ucp_Inscriptional_Pahlavi },
{ 379, PT_SC, ucp_Inscriptional_Parthian },
{ 402, PT_SC, ucp_Javanese },
{ 411, PT_SC, ucp_Kaithi },
{ 418, PT_SC, ucp_Kannada },
{ 426, PT_SC, ucp_Katakana },
{ 435, PT_SC, ucp_Kayah_Li },
{ 444, PT_SC, ucp_Kharoshthi },
{ 455, PT_SC, ucp_Khmer },
{ 461, PT_GC, ucp_L },
{ 463, PT_LAMP, 0 },
{ 466, PT_SC, ucp_Lao },
{ 470, PT_SC, ucp_Latin },
{ 476, PT_SC, ucp_Lepcha },
{ 483, PT_SC, ucp_Limbu },
{ 489, PT_SC, ucp_Linear_B },
{ 498, PT_SC, ucp_Lisu },
{ 503, PT_PC, ucp_Ll },
{ 506, PT_PC, ucp_Lm },
{ 509, PT_PC, ucp_Lo },
{ 512, PT_PC, ucp_Lt },
{ 515, PT_PC, ucp_Lu },
{ 518, PT_SC, ucp_Lycian },
{ 525, PT_SC, ucp_Lydian },
{ 532, PT_GC, ucp_M },
{ 534, PT_SC, ucp_Malayalam },
{ 544, PT_SC, ucp_Mandaic },
{ 552, PT_PC, ucp_Mc },
{ 555, PT_PC, ucp_Me },
{ 558, PT_SC, ucp_Meetei_Mayek },
{ 571, PT_PC, ucp_Mn },
{ 574, PT_SC, ucp_Mongolian },
{ 584, PT_SC, ucp_Myanmar },
{ 592, PT_GC, ucp_N },
{ 594, PT_PC, ucp_Nd },
{ 597, PT_SC, ucp_New_Tai_Lue },
{ 609, PT_SC, ucp_Nko },
{ 613, PT_PC, ucp_Nl },
{ 616, PT_PC, ucp_No },
{ 619, PT_SC, ucp_Ogham },
{ 625, PT_SC, ucp_Ol_Chiki },
{ 634, PT_SC, ucp_Old_Italic },
{ 645, PT_SC, ucp_Old_Persian },
{ 657, PT_SC, ucp_Old_South_Arabian },
{ 675, PT_SC, ucp_Old_Turkic },
{ 686, PT_SC, ucp_Oriya },
{ 692, PT_SC, ucp_Osmanya },
{ 700, PT_GC, ucp_P },
{ 702, PT_PC, ucp_Pc },
{ 705, PT_PC, ucp_Pd },
{ 708, PT_PC, ucp_Pe },
{ 711, PT_PC, ucp_Pf },
{ 714, PT_SC, ucp_Phags_Pa },
{ 723, PT_SC, ucp_Phoenician },
{ 734, PT_PC, ucp_Pi },
{ 737, PT_PC, ucp_Po },
{ 740, PT_PC, ucp_Ps },
{ 743, PT_SC, ucp_Rejang },
{ 750, PT_SC, ucp_Runic },
{ 756, PT_GC, ucp_S },
{ 758, PT_SC, ucp_Samaritan },
{ 768, PT_SC, ucp_Saurashtra },
{ 779, PT_PC, ucp_Sc },
{ 782, PT_SC, ucp_Shavian },
{ 790, PT_SC, ucp_Sinhala },
{ 798, PT_PC, ucp_Sk },
{ 801, PT_PC, ucp_Sm },
{ 804, PT_PC, ucp_So },
{ 807, PT_SC, ucp_Sundanese },
{ 817, PT_SC, ucp_Syloti_Nagri },
{ 830, PT_SC, ucp_Syriac },
{ 837, PT_SC, ucp_Tagalog },
{ 845, PT_SC, ucp_Tagbanwa },
{ 854, PT_SC, ucp_Tai_Le },
{ 861, PT_SC, ucp_Tai_Tham },
{ 870, PT_SC, ucp_Tai_Viet },
{ 879, PT_SC, ucp_Tamil },
{ 885, PT_SC, ucp_Telugu },
{ 892, PT_SC, ucp_Thaana },
{ 899, PT_SC, ucp_Thai },
{ 904, PT_SC, ucp_Tibetan },
{ 912, PT_SC, ucp_Tifinagh },
{ 921, PT_SC, ucp_Ugaritic },
{ 930, PT_SC, ucp_Vai },
{ 934, PT_ALNUM, 0 },
{ 938, PT_PXSPACE, 0 },
{ 942, PT_SPACE, 0 },
{ 946, PT_WORD, 0 },
{ 950, PT_SC, ucp_Yi },
{ 953, PT_GC, ucp_Z },
{ 955, PT_PC, ucp_Zl },
{ 958, PT_PC, ucp_Zp },
{ 961, PT_PC, ucp_Zs }
};
/* Number of entries in _pcre_utt, derived from the array itself. */
const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(_pcre_utt[0]);

View File

@ -6,7 +6,7 @@
and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Copyright (c) 1997-2009 University of Cambridge
Copyright (c) 1997-2010 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@ -104,6 +104,7 @@ while ((t = *data++) != XCL_END)
else /* XCL_PROP & XCL_NOTPROP */
{
int chartype = UCD_CHARTYPE(c);
switch(*data)
{
case PT_ANY:
@ -111,12 +112,13 @@ while ((t = *data++) != XCL_END)
break;
case PT_LAMP:
if ((chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt) ==
(t == XCL_PROP)) return !negated;
if ((chartype == ucp_Lu || chartype == ucp_Ll ||
chartype == ucp_Lt) == (t == XCL_PROP)) return !negated;
break;
case PT_GC:
if ((data[1] == _pcre_ucp_gentype[chartype]) == (t == XCL_PROP)) return !negated;
if ((data[1] == _pcre_ucp_gentype[chartype]) == (t == XCL_PROP))
return !negated;
break;
case PT_PC:
@ -127,6 +129,33 @@ while ((t = *data++) != XCL_END)
if ((data[1] == UCD_SCRIPT(c)) == (t == XCL_PROP)) return !negated;
break;
case PT_ALNUM:
if ((_pcre_ucp_gentype[chartype] == ucp_L ||
_pcre_ucp_gentype[chartype] == ucp_N) == (t == XCL_PROP))
return !negated;
break;
case PT_SPACE: /* Perl space */
if ((_pcre_ucp_gentype[chartype] == ucp_Z ||
c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
== (t == XCL_PROP))
return !negated;
break;
case PT_PXSPACE: /* POSIX space */
if ((_pcre_ucp_gentype[chartype] == ucp_Z ||
c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
c == CHAR_FF || c == CHAR_CR) == (t == XCL_PROP))
return !negated;
break;
case PT_WORD:
if ((_pcre_ucp_gentype[chartype] == ucp_L ||
_pcre_ucp_gentype[chartype] == ucp_N || c == CHAR_UNDERSCORE)
== (t == XCL_PROP))
return !negated;
break;
/* This should never occur, but compilers may mutter if there is no
default. */

View File

@ -150,7 +150,10 @@ enum {
ucp_Old_Turkic = G_UNICODE_SCRIPT_OLD_TURKIC,
ucp_Samaritan = G_UNICODE_SCRIPT_SAMARITAN,
ucp_Tai_Tham = G_UNICODE_SCRIPT_TAI_THAM,
ucp_Tai_Viet = G_UNICODE_SCRIPT_TAI_VIET
ucp_Tai_Viet = G_UNICODE_SCRIPT_TAI_VIET,
ucp_Batak = G_UNICODE_SCRIPT_BATAK,
ucp_Brahmi = G_UNICODE_SCRIPT_BRAHMI,
ucp_Mandaic = G_UNICODE_SCRIPT_MANDAIC
};
#endif