mirror of
https://gitlab.gnome.org/GNOME/glib.git
synced 2024-12-26 23:46:15 +01:00
Forgotten files
This commit is contained in:
parent
3f059a6a12
commit
fb2809ec99
@ -5,7 +5,7 @@
|
||||
/* This is the public header file for the PCRE library, to be #included by
|
||||
applications that call the PCRE functions.
|
||||
|
||||
Copyright (c) 1997-2009 University of Cambridge
|
||||
Copyright (c) 1997-2010 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -42,9 +42,9 @@ POSSIBILITY OF SUCH DAMAGE.
|
||||
/* The current PCRE version information. */
|
||||
|
||||
#define PCRE_MAJOR 8
|
||||
#define PCRE_MINOR 02
|
||||
#define PCRE_MINOR 12
|
||||
#define PCRE_PRERELEASE
|
||||
#define PCRE_DATE 2010-03-19
|
||||
#define PCRE_DATE 2011-01-15
|
||||
|
||||
/* When an application links to a PCRE DLL in Windows, the symbols that are
|
||||
imported have to be identified as such. When building PCRE, the appropriate
|
||||
@ -96,41 +96,44 @@ extern "C" {
|
||||
#endif
|
||||
|
||||
/* Options. Some are compile-time only, some are run-time only, and some are
|
||||
both, so we keep them all distinct. */
|
||||
both, so we keep them all distinct. However, almost all the bits in the options
|
||||
word are now used. In the long run, we may have to re-use some of the
|
||||
compile-time only bits for runtime options, or vice versa. */
|
||||
|
||||
#define PCRE_CASELESS 0x00000001
|
||||
#define PCRE_MULTILINE 0x00000002
|
||||
#define PCRE_DOTALL 0x00000004
|
||||
#define PCRE_EXTENDED 0x00000008
|
||||
#define PCRE_ANCHORED 0x00000010
|
||||
#define PCRE_DOLLAR_ENDONLY 0x00000020
|
||||
#define PCRE_EXTRA 0x00000040
|
||||
#define PCRE_NOTBOL 0x00000080
|
||||
#define PCRE_NOTEOL 0x00000100
|
||||
#define PCRE_UNGREEDY 0x00000200
|
||||
#define PCRE_NOTEMPTY 0x00000400
|
||||
#define PCRE_UTF8 0x00000800
|
||||
#define PCRE_NO_AUTO_CAPTURE 0x00001000
|
||||
#define PCRE_NO_UTF8_CHECK 0x00002000
|
||||
#define PCRE_AUTO_CALLOUT 0x00004000
|
||||
#define PCRE_PARTIAL_SOFT 0x00008000
|
||||
#define PCRE_CASELESS 0x00000001 /* Compile */
|
||||
#define PCRE_MULTILINE 0x00000002 /* Compile */
|
||||
#define PCRE_DOTALL 0x00000004 /* Compile */
|
||||
#define PCRE_EXTENDED 0x00000008 /* Compile */
|
||||
#define PCRE_ANCHORED 0x00000010 /* Compile, exec, DFA exec */
|
||||
#define PCRE_DOLLAR_ENDONLY 0x00000020 /* Compile */
|
||||
#define PCRE_EXTRA 0x00000040 /* Compile */
|
||||
#define PCRE_NOTBOL 0x00000080 /* Exec, DFA exec */
|
||||
#define PCRE_NOTEOL 0x00000100 /* Exec, DFA exec */
|
||||
#define PCRE_UNGREEDY 0x00000200 /* Compile */
|
||||
#define PCRE_NOTEMPTY 0x00000400 /* Exec, DFA exec */
|
||||
#define PCRE_UTF8 0x00000800 /* Compile */
|
||||
#define PCRE_NO_AUTO_CAPTURE 0x00001000 /* Compile */
|
||||
#define PCRE_NO_UTF8_CHECK 0x00002000 /* Compile, exec, DFA exec */
|
||||
#define PCRE_AUTO_CALLOUT 0x00004000 /* Compile */
|
||||
#define PCRE_PARTIAL_SOFT 0x00008000 /* Exec, DFA exec */
|
||||
#define PCRE_PARTIAL 0x00008000 /* Backwards compatible synonym */
|
||||
#define PCRE_DFA_SHORTEST 0x00010000
|
||||
#define PCRE_DFA_RESTART 0x00020000
|
||||
#define PCRE_FIRSTLINE 0x00040000
|
||||
#define PCRE_DUPNAMES 0x00080000
|
||||
#define PCRE_NEWLINE_CR 0x00100000
|
||||
#define PCRE_NEWLINE_LF 0x00200000
|
||||
#define PCRE_NEWLINE_CRLF 0x00300000
|
||||
#define PCRE_NEWLINE_ANY 0x00400000
|
||||
#define PCRE_NEWLINE_ANYCRLF 0x00500000
|
||||
#define PCRE_BSR_ANYCRLF 0x00800000
|
||||
#define PCRE_BSR_UNICODE 0x01000000
|
||||
#define PCRE_JAVASCRIPT_COMPAT 0x02000000
|
||||
#define PCRE_NO_START_OPTIMIZE 0x04000000
|
||||
#define PCRE_NO_START_OPTIMISE 0x04000000
|
||||
#define PCRE_PARTIAL_HARD 0x08000000
|
||||
#define PCRE_NOTEMPTY_ATSTART 0x10000000
|
||||
#define PCRE_DFA_SHORTEST 0x00010000 /* DFA exec */
|
||||
#define PCRE_DFA_RESTART 0x00020000 /* DFA exec */
|
||||
#define PCRE_FIRSTLINE 0x00040000 /* Compile */
|
||||
#define PCRE_DUPNAMES 0x00080000 /* Compile */
|
||||
#define PCRE_NEWLINE_CR 0x00100000 /* Compile, exec, DFA exec */
|
||||
#define PCRE_NEWLINE_LF 0x00200000 /* Compile, exec, DFA exec */
|
||||
#define PCRE_NEWLINE_CRLF 0x00300000 /* Compile, exec, DFA exec */
|
||||
#define PCRE_NEWLINE_ANY 0x00400000 /* Compile, exec, DFA exec */
|
||||
#define PCRE_NEWLINE_ANYCRLF 0x00500000 /* Compile, exec, DFA exec */
|
||||
#define PCRE_BSR_ANYCRLF 0x00800000 /* Compile, exec, DFA exec */
|
||||
#define PCRE_BSR_UNICODE 0x01000000 /* Compile, exec, DFA exec */
|
||||
#define PCRE_JAVASCRIPT_COMPAT 0x02000000 /* Compile */
|
||||
#define PCRE_NO_START_OPTIMIZE 0x04000000 /* Compile, exec, DFA exec */
|
||||
#define PCRE_NO_START_OPTIMISE 0x04000000 /* Synonym */
|
||||
#define PCRE_PARTIAL_HARD 0x08000000 /* Exec, DFA exec */
|
||||
#define PCRE_NOTEMPTY_ATSTART 0x10000000 /* Exec, DFA exec */
|
||||
#define PCRE_UCP 0x20000000 /* Compile */
|
||||
|
||||
/* Exec-time and get/set-time error codes */
|
||||
|
||||
@ -158,6 +161,8 @@ both, so we keep them all distinct. */
|
||||
#define PCRE_ERROR_RECURSIONLIMIT (-21)
|
||||
#define PCRE_ERROR_NULLWSLIMIT (-22) /* No longer actually used */
|
||||
#define PCRE_ERROR_BADNEWLINE (-23)
|
||||
#define PCRE_ERROR_BADOFFSET (-24)
|
||||
#define PCRE_ERROR_SHORTUTF8 (-25)
|
||||
|
||||
/* Request types for pcre_fullinfo() */
|
||||
|
||||
@ -200,6 +205,7 @@ these bits, just add new ones on the end, in order to remain compatible. */
|
||||
#define PCRE_EXTRA_CALLOUT_DATA 0x0004
|
||||
#define PCRE_EXTRA_TABLES 0x0008
|
||||
#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0x0010
|
||||
#define PCRE_EXTRA_MARK 0x0020
|
||||
|
||||
/* Types */
|
||||
|
||||
@ -225,6 +231,7 @@ typedef struct pcre_extra {
|
||||
void *callout_data; /* Data passed back in callouts */
|
||||
const unsigned char *tables; /* Pointer to character tables */
|
||||
unsigned long int match_limit_recursion; /* Max recursive calls to match() */
|
||||
unsigned char **mark; /* For passing back a mark pointer */
|
||||
} pcre_extra;
|
||||
|
||||
/* The structure for passing out data via the pcre_callout_function. We use a
|
||||
|
@ -14,7 +14,7 @@ example ISO-8859-1. When dftables is run, it creates these tables in the
|
||||
current locale. If PCRE is configured with --enable-rebuild-chartables, this
|
||||
happens automatically.
|
||||
|
||||
The following #includes are present because without the gcc 4.x may remove the
|
||||
The following #includes are present because without them gcc 4.x may remove the
|
||||
array definition from the final binary if PCRE is built into a static library
|
||||
and dead code stripping is activated. This leads to link errors. Pulling in the
|
||||
header ensures that the array gets flagged as "someone outside this compilation
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -106,7 +106,7 @@ never stored, so we push them well clear of the normal opcodes. */
|
||||
|
||||
|
||||
/* This table identifies those opcodes that are followed immediately by a
|
||||
character that is to be tested in some way. This makes is possible to
|
||||
character that is to be tested in some way. This makes it possible to
|
||||
centralize the loading of these characters. In the case of Type * etc, the
|
||||
"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
|
||||
small value. Non-zero values in the table are the offsets from the opcode where
|
||||
@ -161,8 +161,9 @@ static const uschar coptable[] = {
|
||||
0, 0, /* RREF, NRREF */
|
||||
0, /* DEF */
|
||||
0, 0, /* BRAZERO, BRAMINZERO */
|
||||
0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
|
||||
0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
|
||||
0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */
|
||||
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */
|
||||
0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
|
||||
};
|
||||
|
||||
/* This table identifies those opcodes that inspect a character. It is used to
|
||||
@ -218,8 +219,9 @@ static const uschar poptable[] = {
|
||||
0, 0, /* RREF, NRREF */
|
||||
0, /* DEF */
|
||||
0, 0, /* BRAZERO, BRAMINZERO */
|
||||
0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
|
||||
0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
|
||||
0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */
|
||||
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */
|
||||
0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
|
||||
};
|
||||
|
||||
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
|
||||
@ -473,7 +475,7 @@ if (*first_op == OP_REVERSE)
|
||||
|
||||
{
|
||||
gone_back = (current_subject - max_back < start_subject)?
|
||||
current_subject - start_subject : max_back;
|
||||
(int)(current_subject - start_subject) : max_back;
|
||||
current_subject -= gone_back;
|
||||
}
|
||||
|
||||
@ -490,7 +492,7 @@ if (*first_op == OP_REVERSE)
|
||||
int back = GET(end_code, 2+LINK_SIZE);
|
||||
if (back <= gone_back)
|
||||
{
|
||||
int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
|
||||
int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
|
||||
ADD_NEW_DATA(-bstate, 0, gone_back - back);
|
||||
}
|
||||
end_code += GET(end_code, 1);
|
||||
@ -526,7 +528,7 @@ else
|
||||
((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
|
||||
do
|
||||
{
|
||||
ADD_NEW(end_code - start_code + length, 0);
|
||||
ADD_NEW((int)(end_code - start_code + length), 0);
|
||||
end_code += GET(end_code, 1);
|
||||
length = 1 + LINK_SIZE;
|
||||
}
|
||||
@ -753,8 +755,8 @@ for (;;)
|
||||
if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
|
||||
if (offsetcount >= 2)
|
||||
{
|
||||
offsets[0] = current_subject - start_subject;
|
||||
offsets[1] = ptr - start_subject;
|
||||
offsets[0] = (int)(current_subject - start_subject);
|
||||
offsets[1] = (int)(ptr - start_subject);
|
||||
DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
|
||||
offsets[1] - offsets[0], current_subject));
|
||||
}
|
||||
@ -776,7 +778,7 @@ for (;;)
|
||||
/*-----------------------------------------------------------------*/
|
||||
case OP_ALT:
|
||||
do { code += GET(code, 1); } while (*code == OP_ALT);
|
||||
ADD_ACTIVE(code - start_code, 0);
|
||||
ADD_ACTIVE((int)(code - start_code), 0);
|
||||
break;
|
||||
|
||||
/*-----------------------------------------------------------------*/
|
||||
@ -784,7 +786,7 @@ for (;;)
|
||||
case OP_SBRA:
|
||||
do
|
||||
{
|
||||
ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
|
||||
ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
|
||||
code += GET(code, 1);
|
||||
}
|
||||
while (*code == OP_ALT);
|
||||
@ -793,11 +795,11 @@ for (;;)
|
||||
/*-----------------------------------------------------------------*/
|
||||
case OP_CBRA:
|
||||
case OP_SCBRA:
|
||||
ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
|
||||
ADD_ACTIVE((int)(code - start_code + 3 + LINK_SIZE), 0);
|
||||
code += GET(code, 1);
|
||||
while (*code == OP_ALT)
|
||||
{
|
||||
ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
|
||||
ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
|
||||
code += GET(code, 1);
|
||||
}
|
||||
break;
|
||||
@ -808,14 +810,14 @@ for (;;)
|
||||
ADD_ACTIVE(state_offset + 1, 0);
|
||||
code += 1 + GET(code, 2);
|
||||
while (*code == OP_ALT) code += GET(code, 1);
|
||||
ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
|
||||
ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
|
||||
break;
|
||||
|
||||
/*-----------------------------------------------------------------*/
|
||||
case OP_SKIPZERO:
|
||||
code += 1 + GET(code, 2);
|
||||
while (*code == OP_ALT) code += GET(code, 1);
|
||||
ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
|
||||
ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
|
||||
break;
|
||||
|
||||
/*-----------------------------------------------------------------*/
|
||||
@ -829,7 +831,12 @@ for (;;)
|
||||
|
||||
/*-----------------------------------------------------------------*/
|
||||
case OP_EOD:
|
||||
if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
|
||||
if (ptr >= end_subject)
|
||||
{
|
||||
if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
|
||||
could_continue = TRUE;
|
||||
else { ADD_ACTIVE(state_offset + 1, 0); }
|
||||
}
|
||||
break;
|
||||
|
||||
/*-----------------------------------------------------------------*/
|
||||
@ -869,7 +876,9 @@ for (;;)
|
||||
|
||||
/*-----------------------------------------------------------------*/
|
||||
case OP_EODN:
|
||||
if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
|
||||
if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
|
||||
could_continue = TRUE;
|
||||
else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
|
||||
{ ADD_ACTIVE(state_offset + 1, 0); }
|
||||
break;
|
||||
|
||||
@ -877,7 +886,9 @@ for (;;)
|
||||
case OP_DOLL:
|
||||
if ((md->moptions & PCRE_NOTEOL) == 0)
|
||||
{
|
||||
if (clen == 0 ||
|
||||
if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
|
||||
could_continue = TRUE;
|
||||
else if (clen == 0 ||
|
||||
((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
|
||||
((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
|
||||
))
|
||||
@ -920,13 +931,37 @@ for (;;)
|
||||
if (utf8) BACKCHAR(temp);
|
||||
#endif
|
||||
GETCHARTEST(d, temp);
|
||||
#ifdef SUPPORT_UCP
|
||||
if ((md->poptions & PCRE_UCP) != 0)
|
||||
{
|
||||
if (d == '_') left_word = TRUE; else
|
||||
{
|
||||
int cat = UCD_CATEGORY(d);
|
||||
left_word = (cat == ucp_L || cat == ucp_N);
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
|
||||
}
|
||||
else left_word = 0;
|
||||
else left_word = FALSE;
|
||||
|
||||
if (clen > 0)
|
||||
{
|
||||
#ifdef SUPPORT_UCP
|
||||
if ((md->poptions & PCRE_UCP) != 0)
|
||||
{
|
||||
if (c == '_') right_word = TRUE; else
|
||||
{
|
||||
int cat = UCD_CATEGORY(c);
|
||||
right_word = (cat == ucp_L || cat == ucp_N);
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
|
||||
else right_word = 0;
|
||||
}
|
||||
else right_word = FALSE;
|
||||
|
||||
if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
|
||||
{ ADD_ACTIVE(state_offset + 1, 0); }
|
||||
@ -953,7 +988,8 @@ for (;;)
|
||||
break;
|
||||
|
||||
case PT_LAMP:
|
||||
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
|
||||
OK = chartype == ucp_Lu || chartype == ucp_Ll ||
|
||||
chartype == ucp_Lt;
|
||||
break;
|
||||
|
||||
case PT_GC:
|
||||
@ -968,6 +1004,30 @@ for (;;)
|
||||
OK = UCD_SCRIPT(c) == code[2];
|
||||
break;
|
||||
|
||||
/* These are specials for combination cases. */
|
||||
|
||||
case PT_ALNUM:
|
||||
OK = _pcre_ucp_gentype[chartype] == ucp_L ||
|
||||
_pcre_ucp_gentype[chartype] == ucp_N;
|
||||
break;
|
||||
|
||||
case PT_SPACE: /* Perl space */
|
||||
OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
|
||||
c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
|
||||
break;
|
||||
|
||||
case PT_PXSPACE: /* POSIX space */
|
||||
OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
|
||||
c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
|
||||
c == CHAR_FF || c == CHAR_CR;
|
||||
break;
|
||||
|
||||
case PT_WORD:
|
||||
OK = _pcre_ucp_gentype[chartype] == ucp_L ||
|
||||
_pcre_ucp_gentype[chartype] == ucp_N ||
|
||||
c == CHAR_UNDERSCORE;
|
||||
break;
|
||||
|
||||
/* Should never occur, but keep compilers from grumbling. */
|
||||
|
||||
default:
|
||||
@ -1122,7 +1182,8 @@ for (;;)
|
||||
break;
|
||||
|
||||
case PT_LAMP:
|
||||
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
|
||||
OK = chartype == ucp_Lu || chartype == ucp_Ll ||
|
||||
chartype == ucp_Lt;
|
||||
break;
|
||||
|
||||
case PT_GC:
|
||||
@ -1137,6 +1198,30 @@ for (;;)
|
||||
OK = UCD_SCRIPT(c) == code[3];
|
||||
break;
|
||||
|
||||
/* These are specials for combination cases. */
|
||||
|
||||
case PT_ALNUM:
|
||||
OK = _pcre_ucp_gentype[chartype] == ucp_L ||
|
||||
_pcre_ucp_gentype[chartype] == ucp_N;
|
||||
break;
|
||||
|
||||
case PT_SPACE: /* Perl space */
|
||||
OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
|
||||
c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
|
||||
break;
|
||||
|
||||
case PT_PXSPACE: /* POSIX space */
|
||||
OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
|
||||
c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
|
||||
c == CHAR_FF || c == CHAR_CR;
|
||||
break;
|
||||
|
||||
case PT_WORD:
|
||||
OK = _pcre_ucp_gentype[chartype] == ucp_L ||
|
||||
_pcre_ucp_gentype[chartype] == ucp_N ||
|
||||
c == CHAR_UNDERSCORE;
|
||||
break;
|
||||
|
||||
/* Should never occur, but keep compilers from grumbling. */
|
||||
|
||||
default:
|
||||
@ -1344,7 +1429,8 @@ for (;;)
|
||||
break;
|
||||
|
||||
case PT_LAMP:
|
||||
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
|
||||
OK = chartype == ucp_Lu || chartype == ucp_Ll ||
|
||||
chartype == ucp_Lt;
|
||||
break;
|
||||
|
||||
case PT_GC:
|
||||
@ -1359,6 +1445,30 @@ for (;;)
|
||||
OK = UCD_SCRIPT(c) == code[3];
|
||||
break;
|
||||
|
||||
/* These are specials for combination cases. */
|
||||
|
||||
case PT_ALNUM:
|
||||
OK = _pcre_ucp_gentype[chartype] == ucp_L ||
|
||||
_pcre_ucp_gentype[chartype] == ucp_N;
|
||||
break;
|
||||
|
||||
case PT_SPACE: /* Perl space */
|
||||
OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
|
||||
c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
|
||||
break;
|
||||
|
||||
case PT_PXSPACE: /* POSIX space */
|
||||
OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
|
||||
c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
|
||||
c == CHAR_FF || c == CHAR_CR;
|
||||
break;
|
||||
|
||||
case PT_WORD:
|
||||
OK = _pcre_ucp_gentype[chartype] == ucp_L ||
|
||||
_pcre_ucp_gentype[chartype] == ucp_N ||
|
||||
c == CHAR_UNDERSCORE;
|
||||
break;
|
||||
|
||||
/* Should never occur, but keep compilers from grumbling. */
|
||||
|
||||
default:
|
||||
@ -1591,7 +1701,8 @@ for (;;)
|
||||
break;
|
||||
|
||||
case PT_LAMP:
|
||||
OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
|
||||
OK = chartype == ucp_Lu || chartype == ucp_Ll ||
|
||||
chartype == ucp_Lt;
|
||||
break;
|
||||
|
||||
case PT_GC:
|
||||
@ -1606,6 +1717,30 @@ for (;;)
|
||||
OK = UCD_SCRIPT(c) == code[5];
|
||||
break;
|
||||
|
||||
/* These are specials for combination cases. */
|
||||
|
||||
case PT_ALNUM:
|
||||
OK = _pcre_ucp_gentype[chartype] == ucp_L ||
|
||||
_pcre_ucp_gentype[chartype] == ucp_N;
|
||||
break;
|
||||
|
||||
case PT_SPACE: /* Perl space */
|
||||
OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
|
||||
c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
|
||||
break;
|
||||
|
||||
case PT_PXSPACE: /* POSIX space */
|
||||
OK = _pcre_ucp_gentype[chartype] == ucp_Z ||
|
||||
c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
|
||||
c == CHAR_FF || c == CHAR_CR;
|
||||
break;
|
||||
|
||||
case PT_WORD:
|
||||
OK = _pcre_ucp_gentype[chartype] == ucp_L ||
|
||||
_pcre_ucp_gentype[chartype] == ucp_N ||
|
||||
c == CHAR_UNDERSCORE;
|
||||
break;
|
||||
|
||||
/* Should never occur, but keep compilers from grumbling. */
|
||||
|
||||
default:
|
||||
@ -2233,7 +2368,7 @@ for (;;)
|
||||
points to the byte after the end of the class. If there is a
|
||||
quantifier, this is where it will be. */
|
||||
|
||||
next_state_offset = ecode - start_code;
|
||||
next_state_offset = (int)(ecode - start_code);
|
||||
|
||||
switch (*ecode)
|
||||
{
|
||||
@ -2304,7 +2439,7 @@ for (;;)
|
||||
md, /* static match data */
|
||||
code, /* this subexpression's code */
|
||||
ptr, /* where we currently are */
|
||||
ptr - start_subject, /* start offset */
|
||||
(int)(ptr - start_subject), /* start offset */
|
||||
local_offsets, /* offset vector */
|
||||
sizeof(local_offsets)/sizeof(int), /* size of same */
|
||||
local_workspace, /* workspace vector */
|
||||
@ -2315,7 +2450,7 @@ for (;;)
|
||||
|
||||
if (rc == PCRE_ERROR_DFA_UITEM) return rc;
|
||||
if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
|
||||
{ ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
|
||||
{ ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
|
||||
}
|
||||
break;
|
||||
|
||||
@ -2342,9 +2477,9 @@ for (;;)
|
||||
cb.callout_number = code[LINK_SIZE+2];
|
||||
cb.offset_vector = offsets;
|
||||
cb.subject = (PCRE_SPTR)start_subject;
|
||||
cb.subject_length = end_subject - start_subject;
|
||||
cb.start_match = current_subject - start_subject;
|
||||
cb.current_position = ptr - start_subject;
|
||||
cb.subject_length = (int)(end_subject - start_subject);
|
||||
cb.start_match = (int)(current_subject - start_subject);
|
||||
cb.current_position = (int)(ptr - start_subject);
|
||||
cb.pattern_position = GET(code, LINK_SIZE + 3);
|
||||
cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
|
||||
cb.capture_top = 1;
|
||||
@ -2395,7 +2530,7 @@ for (;;)
|
||||
md, /* fixed match data */
|
||||
asscode, /* this subexpression's code */
|
||||
ptr, /* where we currently are */
|
||||
ptr - start_subject, /* start offset */
|
||||
(int)(ptr - start_subject), /* start offset */
|
||||
local_offsets, /* offset vector */
|
||||
sizeof(local_offsets)/sizeof(int), /* size of same */
|
||||
local_workspace, /* workspace vector */
|
||||
@ -2407,7 +2542,7 @@ for (;;)
|
||||
if (rc == PCRE_ERROR_DFA_UITEM) return rc;
|
||||
if ((rc >= 0) ==
|
||||
(condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
|
||||
{ ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
|
||||
{ ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
|
||||
else
|
||||
{ ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
|
||||
}
|
||||
@ -2428,7 +2563,7 @@ for (;;)
|
||||
md, /* fixed match data */
|
||||
start_code + GET(code, 1), /* this subexpression's code */
|
||||
ptr, /* where we currently are */
|
||||
ptr - start_subject, /* start offset */
|
||||
(int)(ptr - start_subject), /* start offset */
|
||||
local_offsets, /* offset vector */
|
||||
sizeof(local_offsets)/sizeof(int), /* size of same */
|
||||
local_workspace, /* workspace vector */
|
||||
@ -2480,7 +2615,7 @@ for (;;)
|
||||
md, /* fixed match data */
|
||||
code, /* this subexpression's code */
|
||||
ptr, /* where we currently are */
|
||||
ptr - start_subject, /* start offset */
|
||||
(int)(ptr - start_subject), /* start offset */
|
||||
local_offsets, /* offset vector */
|
||||
sizeof(local_offsets)/sizeof(int), /* size of same */
|
||||
local_workspace, /* workspace vector */
|
||||
@ -2497,7 +2632,8 @@ for (;;)
|
||||
|
||||
do { end_subpattern += GET(end_subpattern, 1); }
|
||||
while (*end_subpattern == OP_ALT);
|
||||
next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
|
||||
next_state_offset =
|
||||
(int)(end_subpattern - start_code + LINK_SIZE + 1);
|
||||
|
||||
/* If the end of this subpattern is KETRMAX or KETRMIN, we must
|
||||
arrange for the repeat state also to be added to the relevant list.
|
||||
@ -2505,7 +2641,7 @@ for (;;)
|
||||
|
||||
repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
|
||||
*end_subpattern == OP_KETRMIN)?
|
||||
end_subpattern - start_code - GET(end_subpattern, 1) : -1;
|
||||
(int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
|
||||
|
||||
/* If we have matched an empty string, add the next state at the
|
||||
current character pointer. This is important so that the duplicate
|
||||
@ -2569,9 +2705,9 @@ for (;;)
|
||||
cb.callout_number = code[1];
|
||||
cb.offset_vector = offsets;
|
||||
cb.subject = (PCRE_SPTR)start_subject;
|
||||
cb.subject_length = end_subject - start_subject;
|
||||
cb.start_match = current_subject - start_subject;
|
||||
cb.current_position = ptr - start_subject;
|
||||
cb.subject_length = (int)(end_subject - start_subject);
|
||||
cb.start_match = (int)(current_subject - start_subject);
|
||||
cb.current_position = (int)(ptr - start_subject);
|
||||
cb.pattern_position = GET(code, 2);
|
||||
cb.next_item_length = GET(code, 2 + LINK_SIZE);
|
||||
cb.capture_top = 1;
|
||||
@ -2617,13 +2753,13 @@ for (;;)
|
||||
((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
|
||||
match_count < 0) /* no matches */
|
||||
) && /* And... */
|
||||
ptr >= end_subject && /* Reached end of subject */
|
||||
ptr > current_subject) /* Matched non-empty string */
|
||||
ptr >= end_subject && /* Reached end of subject */
|
||||
ptr > md->start_used_ptr) /* Inspected non-empty string */
|
||||
{
|
||||
if (offsetcount >= 2)
|
||||
{
|
||||
offsets[0] = md->start_used_ptr - start_subject;
|
||||
offsets[1] = end_subject - start_subject;
|
||||
offsets[0] = (int)(md->start_used_ptr - start_subject);
|
||||
offsets[1] = (int)(end_subject - start_subject);
|
||||
}
|
||||
match_count = PCRE_ERROR_PARTIAL;
|
||||
}
|
||||
@ -2708,6 +2844,7 @@ if (re == NULL || subject == NULL || workspace == NULL ||
|
||||
(offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
|
||||
if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
|
||||
if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
|
||||
if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
|
||||
|
||||
/* We need to find the pointer to any study data before we test for byte
|
||||
flipping, so we scan the extra_data block first. This may set two fields in the
|
||||
@ -2826,16 +2963,14 @@ back the character offset. */
|
||||
#ifdef SUPPORT_UTF8
|
||||
if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
|
||||
{
|
||||
if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
|
||||
return PCRE_ERROR_BADUTF8;
|
||||
int tb;
|
||||
if ((tb = _pcre_valid_utf8((uschar *)subject, length)) >= 0)
|
||||
return (tb == length && (options & PCRE_PARTIAL_HARD) != 0)?
|
||||
PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
|
||||
if (start_offset > 0 && start_offset < length)
|
||||
{
|
||||
int tb = ((uschar *)subject)[start_offset];
|
||||
if (tb > 127)
|
||||
{
|
||||
tb &= 0xc0;
|
||||
if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
|
||||
}
|
||||
tb = ((USPTR)subject)[start_offset] & 0xc0;
|
||||
if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@ -2922,9 +3057,11 @@ for (;;)
|
||||
|
||||
/* There are some optimizations that avoid running the match if a known
|
||||
starting point is not found. However, there is an option that disables
|
||||
these, for testing and for ensuring that all callouts do actually occur. */
|
||||
these, for testing and for ensuring that all callouts do actually occur.
|
||||
The option can be set in the regex by (*NO_START_OPT) or passed in
|
||||
match-time options. */
|
||||
|
||||
if ((options & PCRE_NO_START_OPTIMIZE) == 0)
|
||||
if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
|
||||
{
|
||||
/* Advance to a known first byte. */
|
||||
|
||||
@ -2982,8 +3119,16 @@ for (;;)
|
||||
while (current_subject < end_subject)
|
||||
{
|
||||
register unsigned int c = *current_subject;
|
||||
if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
|
||||
else break;
|
||||
if ((start_bits[c/8] & (1 << (c&7))) == 0)
|
||||
{
|
||||
current_subject++;
|
||||
#ifdef SUPPORT_UTF8
|
||||
if (utf8)
|
||||
while(current_subject < end_subject &&
|
||||
(*current_subject & 0xc0) == 0x80) current_subject++;
|
||||
#endif
|
||||
}
|
||||
else break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -408,9 +408,10 @@ capturing parenthesis numbers in back references. */
|
||||
|
||||
/* When UTF-8 encoding is being used, a character is no longer just a single
|
||||
byte. The macros for character handling generate simple sequences when used in
|
||||
byte-mode, and more complicated ones for UTF-8 characters. BACKCHAR should
|
||||
never be called in byte mode. To make sure it can never even appear when UTF-8
|
||||
support is omitted, we don't even define it. */
|
||||
byte-mode, and more complicated ones for UTF-8 characters. GETCHARLENTEST is
|
||||
not used when UTF-8 is not supported, so it is not defined, and BACKCHAR should
|
||||
never be called in byte mode. To make sure they can never even appear when
|
||||
UTF-8 support is omitted, we don't even define them. */
|
||||
|
||||
#ifndef SUPPORT_UTF8
|
||||
#define GETCHAR(c, eptr) c = *eptr;
|
||||
@ -418,43 +419,83 @@ support is omitted, we don't even define it. */
|
||||
#define GETCHARINC(c, eptr) c = *eptr++;
|
||||
#define GETCHARINCTEST(c, eptr) c = *eptr++;
|
||||
#define GETCHARLEN(c, eptr, len) c = *eptr;
|
||||
/* #define GETCHARLENTEST(c, eptr, len) */
|
||||
/* #define BACKCHAR(eptr) */
|
||||
|
||||
#else /* SUPPORT_UTF8 */
|
||||
|
||||
/* These macros were originally written in the form of loops that used data
|
||||
from the tables whose names start with _pcre_utf8_table. They were rewritten by
|
||||
a user so as not to use loops, because in some environments this gives a
|
||||
significant performance advantage, and it seems never to do any harm. */
|
||||
|
||||
/* Base macro to pick up the remaining bytes of a UTF-8 character, not
|
||||
advancing the pointer. */
|
||||
|
||||
#define GETUTF8(c, eptr) \
|
||||
{ \
|
||||
if ((c & 0x20) == 0) \
|
||||
c = ((c & 0x1f) << 6) | (eptr[1] & 0x3f); \
|
||||
else if ((c & 0x10) == 0) \
|
||||
c = ((c & 0x0f) << 12) | ((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \
|
||||
else if ((c & 0x08) == 0) \
|
||||
c = ((c & 0x07) << 18) | ((eptr[1] & 0x3f) << 12) | \
|
||||
((eptr[2] & 0x3f) << 6) | (eptr[3] & 0x3f); \
|
||||
else if ((c & 0x04) == 0) \
|
||||
c = ((c & 0x03) << 24) | ((eptr[1] & 0x3f) << 18) | \
|
||||
((eptr[2] & 0x3f) << 12) | ((eptr[3] & 0x3f) << 6) | \
|
||||
(eptr[4] & 0x3f); \
|
||||
else \
|
||||
c = ((c & 0x01) << 30) | ((eptr[1] & 0x3f) << 24) | \
|
||||
((eptr[2] & 0x3f) << 18) | ((eptr[3] & 0x3f) << 12) | \
|
||||
((eptr[4] & 0x3f) << 6) | (eptr[5] & 0x3f); \
|
||||
}
|
||||
|
||||
/* Get the next UTF-8 character, not advancing the pointer. This is called when
|
||||
we know we are in UTF-8 mode. */
|
||||
|
||||
#define GETCHAR(c, eptr) \
|
||||
c = *eptr; \
|
||||
if (c >= 0xc0) \
|
||||
{ \
|
||||
int gcii; \
|
||||
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
|
||||
int gcss = 6*gcaa; \
|
||||
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
|
||||
for (gcii = 1; gcii <= gcaa; gcii++) \
|
||||
{ \
|
||||
gcss -= 6; \
|
||||
c |= (eptr[gcii] & 0x3f) << gcss; \
|
||||
} \
|
||||
}
|
||||
if (c >= 0xc0) GETUTF8(c, eptr);
|
||||
|
||||
/* Get the next UTF-8 character, testing for UTF-8 mode, and not advancing the
|
||||
pointer. */
|
||||
|
||||
#define GETCHARTEST(c, eptr) \
|
||||
c = *eptr; \
|
||||
if (utf8 && c >= 0xc0) \
|
||||
if (utf8 && c >= 0xc0) GETUTF8(c, eptr);
|
||||
|
||||
/* Base macro to pick up the remaining bytes of a UTF-8 character, advancing
|
||||
the pointer. */
|
||||
|
||||
#define GETUTF8INC(c, eptr) \
|
||||
{ \
|
||||
int gcii; \
|
||||
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
|
||||
int gcss = 6*gcaa; \
|
||||
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
|
||||
for (gcii = 1; gcii <= gcaa; gcii++) \
|
||||
if ((c & 0x20) == 0) \
|
||||
c = ((c & 0x1f) << 6) | (*eptr++ & 0x3f); \
|
||||
else if ((c & 0x10) == 0) \
|
||||
{ \
|
||||
gcss -= 6; \
|
||||
c |= (eptr[gcii] & 0x3f) << gcss; \
|
||||
c = ((c & 0x0f) << 12) | ((*eptr & 0x3f) << 6) | (eptr[1] & 0x3f); \
|
||||
eptr += 2; \
|
||||
} \
|
||||
else if ((c & 0x08) == 0) \
|
||||
{ \
|
||||
c = ((c & 0x07) << 18) | ((*eptr & 0x3f) << 12) | \
|
||||
((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \
|
||||
eptr += 3; \
|
||||
} \
|
||||
else if ((c & 0x04) == 0) \
|
||||
{ \
|
||||
c = ((c & 0x03) << 24) | ((*eptr & 0x3f) << 18) | \
|
||||
((eptr[1] & 0x3f) << 12) | ((eptr[2] & 0x3f) << 6) | \
|
||||
(eptr[3] & 0x3f); \
|
||||
eptr += 4; \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
c = ((c & 0x01) << 30) | ((*eptr & 0x3f) << 24) | \
|
||||
((eptr[1] & 0x3f) << 18) | ((eptr[2] & 0x3f) << 12) | \
|
||||
((eptr[3] & 0x3f) << 6) | (eptr[4] & 0x3f); \
|
||||
eptr += 5; \
|
||||
} \
|
||||
}
|
||||
|
||||
@ -463,31 +504,49 @@ know we are in UTF-8 mode. */
|
||||
|
||||
#define GETCHARINC(c, eptr) \
|
||||
c = *eptr++; \
|
||||
if (c >= 0xc0) \
|
||||
{ \
|
||||
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
|
||||
int gcss = 6*gcaa; \
|
||||
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
|
||||
while (gcaa-- > 0) \
|
||||
{ \
|
||||
gcss -= 6; \
|
||||
c |= (*eptr++ & 0x3f) << gcss; \
|
||||
} \
|
||||
}
|
||||
if (c >= 0xc0) GETUTF8INC(c, eptr);
|
||||
|
||||
/* Get the next character, testing for UTF-8 mode, and advancing the pointer */
|
||||
/* Get the next character, testing for UTF-8 mode, and advancing the pointer.
|
||||
This is called when we don't know if we are in UTF-8 mode. */
|
||||
|
||||
#define GETCHARINCTEST(c, eptr) \
|
||||
c = *eptr++; \
|
||||
if (utf8 && c >= 0xc0) \
|
||||
if (utf8 && c >= 0xc0) GETUTF8INC(c, eptr);
|
||||
|
||||
/* Base macro to pick up the remaining bytes of a UTF-8 character, not
|
||||
advancing the pointer, incrementing the length. */
|
||||
|
||||
#define GETUTF8LEN(c, eptr, len) \
|
||||
{ \
|
||||
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
|
||||
int gcss = 6*gcaa; \
|
||||
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
|
||||
while (gcaa-- > 0) \
|
||||
if ((c & 0x20) == 0) \
|
||||
{ \
|
||||
gcss -= 6; \
|
||||
c |= (*eptr++ & 0x3f) << gcss; \
|
||||
c = ((c & 0x1f) << 6) | (eptr[1] & 0x3f); \
|
||||
len++; \
|
||||
} \
|
||||
else if ((c & 0x10) == 0) \
|
||||
{ \
|
||||
c = ((c & 0x0f) << 12) | ((eptr[1] & 0x3f) << 6) | (eptr[2] & 0x3f); \
|
||||
len += 2; \
|
||||
} \
|
||||
else if ((c & 0x08) == 0) \
|
||||
{\
|
||||
c = ((c & 0x07) << 18) | ((eptr[1] & 0x3f) << 12) | \
|
||||
((eptr[2] & 0x3f) << 6) | (eptr[3] & 0x3f); \
|
||||
len += 3; \
|
||||
} \
|
||||
else if ((c & 0x04) == 0) \
|
||||
{ \
|
||||
c = ((c & 0x03) << 24) | ((eptr[1] & 0x3f) << 18) | \
|
||||
((eptr[2] & 0x3f) << 12) | ((eptr[3] & 0x3f) << 6) | \
|
||||
(eptr[4] & 0x3f); \
|
||||
len += 4; \
|
||||
} \
|
||||
else \
|
||||
{\
|
||||
c = ((c & 0x01) << 30) | ((eptr[1] & 0x3f) << 24) | \
|
||||
((eptr[2] & 0x3f) << 18) | ((eptr[3] & 0x3f) << 12) | \
|
||||
((eptr[4] & 0x3f) << 6) | (eptr[5] & 0x3f); \
|
||||
len += 5; \
|
||||
} \
|
||||
}
|
||||
|
||||
@ -496,39 +555,15 @@ if there are extra bytes. This is called when we know we are in UTF-8 mode. */
|
||||
|
||||
#define GETCHARLEN(c, eptr, len) \
|
||||
c = *eptr; \
|
||||
if (c >= 0xc0) \
|
||||
{ \
|
||||
int gcii; \
|
||||
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
|
||||
int gcss = 6*gcaa; \
|
||||
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
|
||||
for (gcii = 1; gcii <= gcaa; gcii++) \
|
||||
{ \
|
||||
gcss -= 6; \
|
||||
c |= (eptr[gcii] & 0x3f) << gcss; \
|
||||
} \
|
||||
len += gcaa; \
|
||||
}
|
||||
if (c >= 0xc0) GETUTF8LEN(c, eptr, len);
|
||||
|
||||
/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
|
||||
pointer, incrementing length if there are extra bytes. This is called when we
|
||||
know we are in UTF-8 mode. */
|
||||
do not know if we are in UTF-8 mode. */
|
||||
|
||||
#define GETCHARLENTEST(c, eptr, len) \
|
||||
c = *eptr; \
|
||||
if (utf8 && c >= 0xc0) \
|
||||
{ \
|
||||
int gcii; \
|
||||
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
|
||||
int gcss = 6*gcaa; \
|
||||
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
|
||||
for (gcii = 1; gcii <= gcaa; gcii++) \
|
||||
{ \
|
||||
gcss -= 6; \
|
||||
c |= (eptr[gcii] & 0x3f) << gcss; \
|
||||
} \
|
||||
len += gcaa; \
|
||||
}
|
||||
if (utf8 && c >= 0xc0) GETUTF8LEN(c, eptr, len);
|
||||
|
||||
/* If the pointer is not at the start of a character, move it back until
|
||||
it is. This is called only in UTF-8 mode - we don't put a test within the macro
|
||||
@ -536,7 +571,7 @@ because almost all calls are already within a block of UTF-8 only code. */
|
||||
|
||||
#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--
|
||||
|
||||
#endif
|
||||
#endif /* SUPPORT_UTF8 */
|
||||
|
||||
|
||||
/* In case there is no definition of offsetof() provided - though any proper
|
||||
@ -580,7 +615,7 @@ time, run time, or study time, respectively. */
|
||||
PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
|
||||
PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \
|
||||
PCRE_DUPNAMES|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \
|
||||
PCRE_JAVASCRIPT_COMPAT)
|
||||
PCRE_JAVASCRIPT_COMPAT|PCRE_UCP|PCRE_NO_START_OPTIMIZE)
|
||||
|
||||
#define PUBLIC_EXEC_OPTIONS \
|
||||
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \
|
||||
@ -620,7 +655,7 @@ variable-length repeat, or a anything other than literal characters. */
|
||||
environments where these macros are defined elsewhere. Unfortunately, there
|
||||
is no way to do the same for the typedef. */
|
||||
|
||||
typedef gboolean BOOL;
|
||||
typedef gboolean BOOL;
|
||||
|
||||
/* If PCRE is to support UTF-8 on EBCDIC platforms, we cannot use normal
|
||||
character constants like '*' because the compiler would emit their EBCDIC code,
|
||||
@ -870,6 +905,7 @@ so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */
|
||||
#define STRING_COMMIT0 "COMMIT\0"
|
||||
#define STRING_F0 "F\0"
|
||||
#define STRING_FAIL0 "FAIL\0"
|
||||
#define STRING_MARK0 "MARK\0"
|
||||
#define STRING_PRUNE0 "PRUNE\0"
|
||||
#define STRING_SKIP0 "SKIP\0"
|
||||
#define STRING_THEN "THEN"
|
||||
@ -891,14 +927,16 @@ so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */
|
||||
|
||||
#define STRING_DEFINE "DEFINE"
|
||||
|
||||
#define STRING_CR_RIGHTPAR "CR)"
|
||||
#define STRING_LF_RIGHTPAR "LF)"
|
||||
#define STRING_CRLF_RIGHTPAR "CRLF)"
|
||||
#define STRING_ANY_RIGHTPAR "ANY)"
|
||||
#define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)"
|
||||
#define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)"
|
||||
#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)"
|
||||
#define STRING_UTF8_RIGHTPAR "UTF8)"
|
||||
#define STRING_CR_RIGHTPAR "CR)"
|
||||
#define STRING_LF_RIGHTPAR "LF)"
|
||||
#define STRING_CRLF_RIGHTPAR "CRLF)"
|
||||
#define STRING_ANY_RIGHTPAR "ANY)"
|
||||
#define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)"
|
||||
#define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)"
|
||||
#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)"
|
||||
#define STRING_UTF8_RIGHTPAR "UTF8)"
|
||||
#define STRING_UCP_RIGHTPAR "UCP)"
|
||||
#define STRING_NO_START_OPT_RIGHTPAR "NO_START_OPT)"
|
||||
|
||||
#else /* SUPPORT_UTF8 */
|
||||
|
||||
@ -1122,6 +1160,7 @@ only. */
|
||||
#define STRING_COMMIT0 STR_C STR_O STR_M STR_M STR_I STR_T "\0"
|
||||
#define STRING_F0 STR_F "\0"
|
||||
#define STRING_FAIL0 STR_F STR_A STR_I STR_L "\0"
|
||||
#define STRING_MARK0 STR_M STR_A STR_R STR_K "\0"
|
||||
#define STRING_PRUNE0 STR_P STR_R STR_U STR_N STR_E "\0"
|
||||
#define STRING_SKIP0 STR_S STR_K STR_I STR_P "\0"
|
||||
#define STRING_THEN STR_T STR_H STR_E STR_N
|
||||
@ -1143,14 +1182,16 @@ only. */
|
||||
|
||||
#define STRING_DEFINE STR_D STR_E STR_F STR_I STR_N STR_E
|
||||
|
||||
#define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS
|
||||
#define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS
|
||||
#define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
|
||||
#define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS
|
||||
#define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
|
||||
#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
|
||||
#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS
|
||||
#define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS
|
||||
#define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS
|
||||
#define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS
|
||||
#define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
|
||||
#define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS
|
||||
#define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
|
||||
#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
|
||||
#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS
|
||||
#define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS
|
||||
#define STRING_UCP_RIGHTPAR STR_U STR_C STR_P STR_RIGHT_PARENTHESIS
|
||||
#define STRING_NO_START_OPT_RIGHTPAR STR_N STR_O STR_UNDERSCORE STR_S STR_T STR_A STR_R STR_T STR_UNDERSCORE STR_O STR_P STR_T STR_RIGHT_PARENTHESIS
|
||||
|
||||
#endif /* SUPPORT_UTF8 */
|
||||
|
||||
@ -1183,9 +1224,13 @@ only. */
|
||||
|
||||
#define PT_ANY 0 /* Any property - matches all chars */
|
||||
#define PT_LAMP 1 /* L& - the union of Lu, Ll, Lt */
|
||||
#define PT_GC 2 /* General characteristic (e.g. L) */
|
||||
#define PT_PC 3 /* Particular characteristic (e.g. Lu) */
|
||||
#define PT_GC 2 /* Specified general characteristic (e.g. L) */
|
||||
#define PT_PC 3 /* Specified particular characteristic (e.g. Lu) */
|
||||
#define PT_SC 4 /* Script (e.g. Han) */
|
||||
#define PT_ALNUM 5 /* Alphanumeric - the union of L and N */
|
||||
#define PT_SPACE 6 /* Perl space - Z plus 9,10,12,13 */
|
||||
#define PT_PXSPACE 7 /* POSIX space - Z plus 9,10,11,12,13 */
|
||||
#define PT_WORD 8 /* Word - L plus N plus underscore */
|
||||
|
||||
/* Flag bits and data types for the extended class (OP_XCLASS) for classes that
|
||||
contain UTF-8 characters with values greater than 255. */
|
||||
@ -1202,9 +1247,15 @@ contain UTF-8 characters with values greater than 255. */
|
||||
/* These are escaped items that aren't just an encoding of a particular data
|
||||
value such as \n. They must have non-zero values, as check_escape() returns
|
||||
their negation. Also, they must appear in the same order as in the opcode
|
||||
definitions below, up to ESC_z. There's a dummy for OP_ANY because it
|
||||
corresponds to "." rather than an escape sequence, and another for OP_ALLANY
|
||||
(which is used for [^] in JavaScript compatibility mode).
|
||||
definitions below, up to ESC_z. There's a dummy for OP_ALLANY because it
|
||||
corresponds to "." in DOTALL mode rather than an escape sequence. It is also
|
||||
used for [^] in JavaScript compatibility mode. In non-DOTALL mode, "." behaves
|
||||
like \N.
|
||||
|
||||
The special values ESC_DU, ESC_du, etc. are used instead of ESC_D, ESC_d, etc.
|
||||
when PCRE_UCP is set, when replacement of \d etc by \p sequences is required.
|
||||
They must be contiguous, and remain in order so that the replacements can be
|
||||
looked up from a table.
|
||||
|
||||
The final escape must be ESC_REF as subsequent values are used for
|
||||
backreferences (\1, \2, \3, etc). There are two tests in the code for an escape
|
||||
@ -1214,11 +1265,12 @@ put in between that don't consume a character, that code will have to change.
|
||||
*/
|
||||
|
||||
enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s,
|
||||
ESC_W, ESC_w, ESC_dum1, ESC_dum2, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H,
|
||||
ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, ESC_E, ESC_Q, ESC_g, ESC_k,
|
||||
ESC_W, ESC_w, ESC_N, ESC_dum, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H,
|
||||
ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z,
|
||||
ESC_E, ESC_Q, ESC_g, ESC_k,
|
||||
ESC_DU, ESC_du, ESC_SU, ESC_su, ESC_WU, ESC_wu,
|
||||
ESC_REF };
|
||||
|
||||
|
||||
/* Opcode table: Starting from 1 (i.e. after OP_END), the values up to
|
||||
OP_EOD must correspond in order to the list of escapes immediately above.
|
||||
|
||||
@ -1242,8 +1294,8 @@ enum {
|
||||
OP_WHITESPACE, /* 9 \s */
|
||||
OP_NOT_WORDCHAR, /* 10 \W */
|
||||
OP_WORDCHAR, /* 11 \w */
|
||||
OP_ANY, /* 12 Match any character (subject to DOTALL) */
|
||||
OP_ALLANY, /* 13 Match any character (not subject to DOTALL) */
|
||||
OP_ANY, /* 12 Match any character except newline */
|
||||
OP_ALLANY, /* 13 Match any character */
|
||||
OP_ANYBYTE, /* 14 Match any byte (\C); different to OP_ANY for UTF-8 */
|
||||
OP_NOTPROP, /* 15 \P (not Unicode property) */
|
||||
OP_PROP, /* 16 \p (Unicode property) */
|
||||
@ -1373,20 +1425,24 @@ enum {
|
||||
|
||||
/* These are backtracking control verbs */
|
||||
|
||||
OP_PRUNE, /* 107 */
|
||||
OP_SKIP, /* 108 */
|
||||
OP_THEN, /* 109 */
|
||||
OP_COMMIT, /* 110 */
|
||||
OP_MARK, /* 107 always has an argument */
|
||||
OP_PRUNE, /* 108 */
|
||||
OP_PRUNE_ARG, /* 109 same, but with argument */
|
||||
OP_SKIP, /* 110 */
|
||||
OP_SKIP_ARG, /* 111 same, but with argument */
|
||||
OP_THEN, /* 112 */
|
||||
OP_THEN_ARG, /* 113 same, but with argument */
|
||||
OP_COMMIT, /* 114 */
|
||||
|
||||
/* These are forced failure and success verbs */
|
||||
|
||||
OP_FAIL, /* 111 */
|
||||
OP_ACCEPT, /* 112 */
|
||||
OP_CLOSE, /* 113 Used before OP_ACCEPT to close open captures */
|
||||
OP_FAIL, /* 115 */
|
||||
OP_ACCEPT, /* 116 */
|
||||
OP_CLOSE, /* 117 Used before OP_ACCEPT to close open captures */
|
||||
|
||||
/* This is used to skip a subpattern with a {0} quantifier */
|
||||
|
||||
OP_SKIPZERO, /* 114 */
|
||||
OP_SKIPZERO, /* 118 */
|
||||
|
||||
/* This is not an opcode, but is used to check that tables indexed by opcode
|
||||
are the correct length, in order to catch updating errors - there have been
|
||||
@ -1397,7 +1453,7 @@ enum {
|
||||
|
||||
/* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro
|
||||
definitions that follow must also be updated to match. There are also tables
|
||||
called "coptable" cna "poptable" in pcre_dfa_exec.c that must be updated. */
|
||||
called "coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */
|
||||
|
||||
|
||||
/* This macro defines textual names for all the opcodes. These are used only
|
||||
@ -1422,7 +1478,8 @@ for debugging. The macro is referenced only in pcre_printint.c. */
|
||||
"Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", \
|
||||
"Cond ref", "Cond nref", "Cond rec", "Cond nrec", "Cond def", \
|
||||
"Brazero", "Braminzero", \
|
||||
"*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \
|
||||
"*MARK", "*PRUNE", "*PRUNE", "*SKIP", "*SKIP", \
|
||||
"*THEN", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \
|
||||
"Close", "Skip zero"
|
||||
|
||||
|
||||
@ -1488,8 +1545,10 @@ in UTF-8 mode. The code that uses this table must know about such things. */
|
||||
3, 3, /* RREF, NRREF */ \
|
||||
1, /* DEF */ \
|
||||
1, 1, /* BRAZERO, BRAMINZERO */ \
|
||||
1, 1, 1, 1, /* PRUNE, SKIP, THEN, COMMIT, */ \
|
||||
1, 1, 3, 1 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
|
||||
3, 1, 3, /* MARK, PRUNE, PRUNE_ARG */ \
|
||||
1, 3, /* SKIP, SKIP_ARG */ \
|
||||
1+LINK_SIZE, 3+LINK_SIZE, /* THEN, THEN_ARG */ \
|
||||
1, 1, 1, 3, 1 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
|
||||
|
||||
|
||||
/* A magic value for OP_RREF and OP_NRREF to indicate the "any recursion"
|
||||
@ -1507,7 +1566,8 @@ enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
|
||||
ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
|
||||
ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
|
||||
ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
|
||||
ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERRCOUNT };
|
||||
ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERR66, ERR67, ERR68,
|
||||
ERRCOUNT };
|
||||
|
||||
/* The real format of the start of the pcre block; the index of names and the
|
||||
code vector run on as long as necessary after the end. We store an explicit
|
||||
@ -1650,6 +1710,7 @@ typedef struct match_data {
|
||||
BOOL noteol; /* NOTEOL flag */
|
||||
BOOL utf8; /* UTF8 flag */
|
||||
BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */
|
||||
BOOL use_ucp; /* PCRE_UCP flag */
|
||||
BOOL endonly; /* Dollar not before final \n */
|
||||
BOOL notempty; /* Empty string match not wanted */
|
||||
BOOL notempty_atstart; /* Empty string match at start not wanted */
|
||||
@ -1669,6 +1730,7 @@ typedef struct match_data {
|
||||
int eptrn; /* Next free eptrblock */
|
||||
recursion_info *recursive; /* Linked list of recursion data */
|
||||
void *callout_data; /* To pass back to callouts */
|
||||
const uschar *mark; /* Mark pointer to pass back */
|
||||
} match_data;
|
||||
|
||||
/* A similar structure is used for the same purpose by the DFA matching
|
||||
@ -1764,7 +1826,7 @@ extern BOOL _pcre_is_newline(USPTR, int, USPTR, int *, BOOL);
|
||||
extern int _pcre_ord2utf8(int, uschar *);
|
||||
extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *,
|
||||
const pcre_study_data *, pcre_study_data *);
|
||||
#define _pcre_valid_utf8(u, i) TRUE
|
||||
#define _pcre_valid_utf8(USPTR, int) TRUE
|
||||
extern BOOL _pcre_was_newline(USPTR, int, USPTR, int *, BOOL);
|
||||
extern BOOL _pcre_xclass(int, const uschar *);
|
||||
|
||||
|
@ -48,6 +48,7 @@ supporting functions. */
|
||||
|
||||
#include "pcre_internal.h"
|
||||
|
||||
/* Set the bit for byte value c in the start_bits bitmap that is in scope at
the point of use. Both the argument and the whole expansion are parenthesized
so that an expression argument (for example a sum) is divided and masked as a
single value. Note that c is evaluated twice, so it must not have side
effects. */

#define SET_BIT(c) (start_bits[(c)/8] |= (1 << ((c)&7)))
|
||||
|
||||
/* Returns from set_start_bits() */
|
||||
|
||||
@ -413,6 +414,18 @@ for (;;)
|
||||
#endif
|
||||
break;
|
||||
|
||||
/* Skip these, but we need to add in the name length. */
|
||||
|
||||
case OP_MARK:
|
||||
case OP_PRUNE_ARG:
|
||||
case OP_SKIP_ARG:
|
||||
cc += _pcre_OP_lengths[op] + cc[1];
|
||||
break;
|
||||
|
||||
case OP_THEN_ARG:
|
||||
cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
|
||||
break;
|
||||
|
||||
/* For the record, these are the opcodes that are matched by "default":
|
||||
OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
|
||||
OP_THEN. */
|
||||
@ -431,25 +444,121 @@ for (;;)
|
||||
* Set a bit and maybe its alternate case *
|
||||
*************************************************/
|
||||
|
||||
/* Given a character, set its bit in the table, and also the bit for the other
|
||||
version of a letter if we are caseless.
|
||||
/* Given a character, set its first byte's bit in the table, and also the
|
||||
corresponding bit for the other version of a letter if we are caseless. In
|
||||
UTF-8 mode, for characters greater than 127, we can only do the caseless thing
|
||||
when Unicode property support is available.
|
||||
|
||||
Arguments:
|
||||
start_bits points to the bit map
|
||||
c is the character
|
||||
p points to the character
|
||||
caseless the caseless flag
|
||||
cd the block with char table pointers
|
||||
utf8 TRUE for UTF-8 mode
|
||||
|
||||
Returns: nothing
|
||||
Returns: pointer after the character
|
||||
*/
|
||||
|
||||
static const uschar *
|
||||
set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
|
||||
compile_data *cd, BOOL utf8)
|
||||
{
|
||||
unsigned int c = *p;
|
||||
|
||||
/* The first byte of the character is always a possible starting byte. */
SET_BIT(c);
|
||||
|
||||
#ifdef SUPPORT_UTF8
|
||||
if (utf8 && c > 127)
|
||||
{
|
||||
/* Decode the whole multi-byte character; GETCHARINC also advances p past it. */
GETCHARINC(c, p);
|
||||
#ifdef SUPPORT_UCP
|
||||
if (caseless)
|
||||
{
|
||||
/* Re-encode the other-case character and record only its first byte. */
uschar buff[8];
|
||||
c = UCD_OTHERCASE(c);
|
||||
(void)_pcre_ord2utf8(c, buff);
|
||||
SET_BIT(buff[0]);
|
||||
}
|
||||
#endif
|
||||
return p;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Not UTF-8 mode, or character is less than 128. */
|
||||
|
||||
if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
|
||||
return p + 1;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Set bits for a positive character type *
|
||||
*************************************************/
|
||||
|
||||
/* This function sets starting bits for a character type. In UTF-8 mode, we can
|
||||
only do a direct setting for bytes less than 128, as otherwise there can be
|
||||
confusion with bytes in the middle of UTF-8 characters. In a "traditional"
|
||||
environment, the tables will only recognize ASCII characters anyway, but in at
|
||||
least one Windows environment, some higher bytes bits were set in the tables.
|
||||
So we deal with that case by considering the UTF-8 encoding.
|
||||
|
||||
Arguments:
|
||||
start_bits the starting bitmap
|
||||
cbit_type the type of character wanted
|
||||
table_limit 32 for non-UTF-8; 16 for UTF-8
|
||||
cd the block with char table pointers
|
||||
|
||||
Returns: nothing
|
||||
*/
|
||||
|
||||
static void
|
||||
set_table_bit(uschar *start_bits, unsigned int c, BOOL caseless,
|
||||
set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
|
||||
compile_data *cd)
|
||||
{
|
||||
start_bits[c/8] |= (1 << (c&7));
|
||||
if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
|
||||
start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
|
||||
register int c;
|
||||
for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
|
||||
if (table_limit == 32) return;
|
||||
for (c = 128; c < 256; c++)
|
||||
{
|
||||
if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
|
||||
{
|
||||
uschar buff[8];
|
||||
(void)_pcre_ord2utf8(c, buff);
|
||||
SET_BIT(buff[0]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Set bits for a negative character type *
|
||||
*************************************************/
|
||||
|
||||
/* This function sets starting bits for a negative character type such as \D.
|
||||
In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
|
||||
otherwise there can be confusion with bytes in the middle of UTF-8 characters.
|
||||
Unlike in the positive case, where we can set appropriate starting bits for
|
||||
specific high-valued UTF-8 characters, in this case we have to set the bits for
|
||||
all high-valued characters. The lowest is 0xc2, but we overkill by starting at
|
||||
0xc0 (192) for simplicity.
|
||||
|
||||
Arguments:
|
||||
start_bits the starting bitmap
|
||||
cbit_type the type of character wanted
|
||||
table_limit 32 for non-UTF-8; 16 for UTF-8
|
||||
cd the block with char table pointers
|
||||
|
||||
Returns: nothing
|
||||
*/
|
||||
|
||||
static void
|
||||
set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
|
||||
compile_data *cd)
|
||||
{
|
||||
register int c;
|
||||
/* OR in the complement of the class bitmap for the first table_limit bytes of
the map (each map byte covers eight character values). */
for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
|
||||
/* With a reduced table_limit, map bytes 24..31 (character values 0xc0..0xff)
are all set, so any byte in that range is accepted as a starting byte. */
if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
|
||||
}
|
||||
|
||||
|
||||
@ -484,6 +593,7 @@ set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
|
||||
{
|
||||
register int c;
|
||||
int yield = SSB_DONE;
|
||||
int table_limit = utf8? 16:32;
|
||||
|
||||
#if 0
|
||||
/* ========================================================================= */
|
||||
@ -607,12 +717,7 @@ do
|
||||
case OP_QUERY:
|
||||
case OP_MINQUERY:
|
||||
case OP_POSQUERY:
|
||||
set_table_bit(start_bits, tcode[1], caseless, cd);
|
||||
tcode += 2;
|
||||
#ifdef SUPPORT_UTF8
|
||||
if (utf8 && tcode[-1] >= 0xc0)
|
||||
tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
|
||||
#endif
|
||||
tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
|
||||
break;
|
||||
|
||||
/* Single-char upto sets the bit and tries the next */
|
||||
@ -620,12 +725,7 @@ do
|
||||
case OP_UPTO:
|
||||
case OP_MINUPTO:
|
||||
case OP_POSUPTO:
|
||||
set_table_bit(start_bits, tcode[3], caseless, cd);
|
||||
tcode += 4;
|
||||
#ifdef SUPPORT_UTF8
|
||||
if (utf8 && tcode[-1] >= 0xc0)
|
||||
tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
|
||||
#endif
|
||||
tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
|
||||
break;
|
||||
|
||||
/* At least one single char sets the bit and stops */
|
||||
@ -638,59 +738,86 @@ do
|
||||
case OP_PLUS:
|
||||
case OP_MINPLUS:
|
||||
case OP_POSPLUS:
|
||||
set_table_bit(start_bits, tcode[1], caseless, cd);
|
||||
(void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
|
||||
try_next = FALSE;
|
||||
break;
|
||||
|
||||
/* Single character type sets the bits and stops */
|
||||
/* Special spacing and line-terminating items. These recognize specific
|
||||
lists of characters. The difference between VSPACE and ANYNL is that the
|
||||
latter can match the two-character CRLF sequence, but that is not
|
||||
relevant for finding the first character, so their code here is
|
||||
identical. */
|
||||
|
||||
case OP_HSPACE:
|
||||
SET_BIT(0x09);
|
||||
SET_BIT(0x20);
|
||||
if (utf8)
|
||||
{
|
||||
SET_BIT(0xC2); /* For U+00A0 */
|
||||
SET_BIT(0xE1); /* For U+1680, U+180E */
|
||||
SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
|
||||
SET_BIT(0xE3); /* For U+3000 */
|
||||
}
|
||||
else SET_BIT(0xA0);
|
||||
try_next = FALSE;
|
||||
break;
|
||||
|
||||
case OP_ANYNL:
|
||||
case OP_VSPACE:
|
||||
SET_BIT(0x0A);
|
||||
SET_BIT(0x0B);
|
||||
SET_BIT(0x0C);
|
||||
SET_BIT(0x0D);
|
||||
if (utf8)
|
||||
{
|
||||
SET_BIT(0xC2); /* For U+0085 */
|
||||
SET_BIT(0xE2); /* For U+2028, U+2029 */
|
||||
}
|
||||
else SET_BIT(0x85);
|
||||
try_next = FALSE;
|
||||
break;
|
||||
|
||||
/* Single character types set the bits and stop. Note that if PCRE_UCP
|
||||
is set, we do not see these op codes because \d etc are converted to
|
||||
properties. Therefore, these apply in the case when only characters less
|
||||
than 256 are recognized to match the types. */
|
||||
|
||||
case OP_NOT_DIGIT:
|
||||
for (c = 0; c < 32; c++)
|
||||
start_bits[c] |= ~cd->cbits[c+cbit_digit];
|
||||
set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
|
||||
try_next = FALSE;
|
||||
break;
|
||||
|
||||
case OP_DIGIT:
|
||||
for (c = 0; c < 32; c++)
|
||||
start_bits[c] |= cd->cbits[c+cbit_digit];
|
||||
set_type_bits(start_bits, cbit_digit, table_limit, cd);
|
||||
try_next = FALSE;
|
||||
break;
|
||||
|
||||
/* The cbit_space table has vertical tab as whitespace; we have to
|
||||
discard it. */
|
||||
ensure it is set as not whitespace. */
|
||||
|
||||
case OP_NOT_WHITESPACE:
|
||||
for (c = 0; c < 32; c++)
|
||||
{
|
||||
int d = cd->cbits[c+cbit_space];
|
||||
if (c == 1) d &= ~0x08;
|
||||
start_bits[c] |= ~d;
|
||||
}
|
||||
set_nottype_bits(start_bits, cbit_space, table_limit, cd);
|
||||
start_bits[1] |= 0x08;
|
||||
try_next = FALSE;
|
||||
break;
|
||||
|
||||
/* The cbit_space table has vertical tab as whitespace; we have to
|
||||
discard it. */
|
||||
not set it from the table. */
|
||||
|
||||
case OP_WHITESPACE:
|
||||
for (c = 0; c < 32; c++)
|
||||
{
|
||||
int d = cd->cbits[c+cbit_space];
|
||||
if (c == 1) d &= ~0x08;
|
||||
start_bits[c] |= d;
|
||||
}
|
||||
c = start_bits[1]; /* Save in case it was already set */
|
||||
set_type_bits(start_bits, cbit_space, table_limit, cd);
|
||||
start_bits[1] = (start_bits[1] & ~0x08) | c;
|
||||
try_next = FALSE;
|
||||
break;
|
||||
|
||||
case OP_NOT_WORDCHAR:
|
||||
for (c = 0; c < 32; c++)
|
||||
start_bits[c] |= ~cd->cbits[c+cbit_word];
|
||||
set_nottype_bits(start_bits, cbit_word, table_limit, cd);
|
||||
try_next = FALSE;
|
||||
break;
|
||||
|
||||
case OP_WORDCHAR:
|
||||
for (c = 0; c < 32; c++)
|
||||
start_bits[c] |= cd->cbits[c+cbit_word];
|
||||
set_type_bits(start_bits, cbit_word, table_limit, cd);
|
||||
try_next = FALSE;
|
||||
break;
|
||||
|
||||
@ -699,6 +826,7 @@ do
|
||||
|
||||
case OP_TYPEPLUS:
|
||||
case OP_TYPEMINPLUS:
|
||||
case OP_TYPEPOSPLUS:
|
||||
tcode++;
|
||||
break;
|
||||
|
||||
@ -722,52 +850,69 @@ do
|
||||
case OP_TYPEPOSQUERY:
|
||||
switch(tcode[1])
|
||||
{
|
||||
default:
|
||||
case OP_ANY:
|
||||
case OP_ALLANY:
|
||||
return SSB_FAIL;
|
||||
|
||||
case OP_HSPACE:
|
||||
SET_BIT(0x09);
|
||||
SET_BIT(0x20);
|
||||
if (utf8)
|
||||
{
|
||||
SET_BIT(0xC2); /* For U+00A0 */
|
||||
SET_BIT(0xE1); /* For U+1680, U+180E */
|
||||
SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
|
||||
SET_BIT(0xE3); /* For U+3000 */
|
||||
}
|
||||
else SET_BIT(0xA0);
|
||||
break;
|
||||
|
||||
case OP_ANYNL:
|
||||
case OP_VSPACE:
|
||||
SET_BIT(0x0A);
|
||||
SET_BIT(0x0B);
|
||||
SET_BIT(0x0C);
|
||||
SET_BIT(0x0D);
|
||||
if (utf8)
|
||||
{
|
||||
SET_BIT(0xC2); /* For U+0085 */
|
||||
SET_BIT(0xE2); /* For U+2028, U+2029 */
|
||||
}
|
||||
else SET_BIT(0x85);
|
||||
break;
|
||||
|
||||
case OP_NOT_DIGIT:
|
||||
for (c = 0; c < 32; c++)
|
||||
start_bits[c] |= ~cd->cbits[c+cbit_digit];
|
||||
set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
|
||||
break;
|
||||
|
||||
case OP_DIGIT:
|
||||
for (c = 0; c < 32; c++)
|
||||
start_bits[c] |= cd->cbits[c+cbit_digit];
|
||||
set_type_bits(start_bits, cbit_digit, table_limit, cd);
|
||||
break;
|
||||
|
||||
/* The cbit_space table has vertical tab as whitespace; we have to
|
||||
discard it. */
|
||||
ensure it gets set as not whitespace. */
|
||||
|
||||
case OP_NOT_WHITESPACE:
|
||||
for (c = 0; c < 32; c++)
|
||||
{
|
||||
int d = cd->cbits[c+cbit_space];
|
||||
if (c == 1) d &= ~0x08;
|
||||
start_bits[c] |= ~d;
|
||||
}
|
||||
set_nottype_bits(start_bits, cbit_space, table_limit, cd);
|
||||
start_bits[1] |= 0x08;
|
||||
break;
|
||||
|
||||
/* The cbit_space table has vertical tab as whitespace; we have to
|
||||
discard it. */
|
||||
avoid setting it. */
|
||||
|
||||
case OP_WHITESPACE:
|
||||
for (c = 0; c < 32; c++)
|
||||
{
|
||||
int d = cd->cbits[c+cbit_space];
|
||||
if (c == 1) d &= ~0x08;
|
||||
start_bits[c] |= d;
|
||||
}
|
||||
c = start_bits[1]; /* Save in case it was already set */
|
||||
set_type_bits(start_bits, cbit_space, table_limit, cd);
|
||||
start_bits[1] = (start_bits[1] & ~0x08) | c;
|
||||
break;
|
||||
|
||||
case OP_NOT_WORDCHAR:
|
||||
for (c = 0; c < 32; c++)
|
||||
start_bits[c] |= ~cd->cbits[c+cbit_word];
|
||||
set_nottype_bits(start_bits, cbit_word, table_limit, cd);
|
||||
break;
|
||||
|
||||
case OP_WORDCHAR:
|
||||
for (c = 0; c < 32; c++)
|
||||
start_bits[c] |= cd->cbits[c+cbit_word];
|
||||
set_type_bits(start_bits, cbit_word, table_limit, cd);
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -123,8 +123,10 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
|
||||
#define STRING_Avestan0 STR_A STR_v STR_e STR_s STR_t STR_a STR_n "\0"
|
||||
#define STRING_Balinese0 STR_B STR_a STR_l STR_i STR_n STR_e STR_s STR_e "\0"
|
||||
#define STRING_Bamum0 STR_B STR_a STR_m STR_u STR_m "\0"
|
||||
#define STRING_Batak0 STR_B STR_a STR_t STR_a STR_k "\0"
|
||||
#define STRING_Bengali0 STR_B STR_e STR_n STR_g STR_a STR_l STR_i "\0"
|
||||
#define STRING_Bopomofo0 STR_B STR_o STR_p STR_o STR_m STR_o STR_f STR_o "\0"
|
||||
#define STRING_Brahmi0 STR_B STR_r STR_a STR_h STR_m STR_i "\0"
|
||||
#define STRING_Braille0 STR_B STR_r STR_a STR_i STR_l STR_l STR_e "\0"
|
||||
#define STRING_Buginese0 STR_B STR_u STR_g STR_i STR_n STR_e STR_s STR_e "\0"
|
||||
#define STRING_Buhid0 STR_B STR_u STR_h STR_i STR_d "\0"
|
||||
@ -184,6 +186,7 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
|
||||
#define STRING_Lu0 STR_L STR_u "\0"
|
||||
#define STRING_Lycian0 STR_L STR_y STR_c STR_i STR_a STR_n "\0"
|
||||
#define STRING_Lydian0 STR_L STR_y STR_d STR_i STR_a STR_n "\0"
|
||||
#define STRING_Mandaic0 STR_M STR_a STR_n STR_d STR_a STR_i STR_c "\0"
|
||||
#define STRING_M0 STR_M "\0"
|
||||
#define STRING_Malayalam0 STR_M STR_a STR_l STR_a STR_y STR_a STR_l STR_a STR_m "\0"
|
||||
#define STRING_Mc0 STR_M STR_c "\0"
|
||||
@ -243,6 +246,10 @@ strings to make sure that UTF-8 support works on EBCDIC platforms. */
|
||||
#define STRING_Tifinagh0 STR_T STR_i STR_f STR_i STR_n STR_a STR_g STR_h "\0"
|
||||
#define STRING_Ugaritic0 STR_U STR_g STR_a STR_r STR_i STR_t STR_i STR_c "\0"
|
||||
#define STRING_Vai0 STR_V STR_a STR_i "\0"
|
||||
#define STRING_Xan0 STR_X STR_a STR_n "\0"
|
||||
#define STRING_Xps0 STR_X STR_p STR_s "\0"
|
||||
#define STRING_Xsp0 STR_X STR_s STR_p "\0"
|
||||
#define STRING_Xwd0 STR_X STR_w STR_d "\0"
|
||||
#define STRING_Yi0 STR_Y STR_i "\0"
|
||||
#define STRING_Z0 STR_Z "\0"
|
||||
#define STRING_Zl0 STR_Z STR_l "\0"
|
||||
@ -256,8 +263,10 @@ const char _pcre_utt_names[] =
|
||||
STRING_Avestan0
|
||||
STRING_Balinese0
|
||||
STRING_Bamum0
|
||||
STRING_Batak0
|
||||
STRING_Bengali0
|
||||
STRING_Bopomofo0
|
||||
STRING_Brahmi0
|
||||
STRING_Braille0
|
||||
STRING_Buginese0
|
||||
STRING_Buhid0
|
||||
@ -319,6 +328,7 @@ const char _pcre_utt_names[] =
|
||||
STRING_Lydian0
|
||||
STRING_M0
|
||||
STRING_Malayalam0
|
||||
STRING_Mandaic0
|
||||
STRING_Mc0
|
||||
STRING_Me0
|
||||
STRING_Meetei_Mayek0
|
||||
@ -376,6 +386,10 @@ const char _pcre_utt_names[] =
|
||||
STRING_Tifinagh0
|
||||
STRING_Ugaritic0
|
||||
STRING_Vai0
|
||||
STRING_Xan0
|
||||
STRING_Xps0
|
||||
STRING_Xsp0
|
||||
STRING_Xwd0
|
||||
STRING_Yi0
|
||||
STRING_Z0
|
||||
STRING_Zl0
|
||||
@ -389,131 +403,138 @@ const ucp_type_table _pcre_utt[] = {
|
||||
{ 20, PT_SC, ucp_Avestan },
|
||||
{ 28, PT_SC, ucp_Balinese },
|
||||
{ 37, PT_SC, ucp_Bamum },
|
||||
{ 43, PT_SC, ucp_Bengali },
|
||||
{ 51, PT_SC, ucp_Bopomofo },
|
||||
{ 60, PT_SC, ucp_Braille },
|
||||
{ 68, PT_SC, ucp_Buginese },
|
||||
{ 77, PT_SC, ucp_Buhid },
|
||||
{ 83, PT_GC, ucp_C },
|
||||
{ 85, PT_SC, ucp_Canadian_Aboriginal },
|
||||
{ 105, PT_SC, ucp_Carian },
|
||||
{ 112, PT_PC, ucp_Cc },
|
||||
{ 115, PT_PC, ucp_Cf },
|
||||
{ 118, PT_SC, ucp_Cham },
|
||||
{ 123, PT_SC, ucp_Cherokee },
|
||||
{ 132, PT_PC, ucp_Cn },
|
||||
{ 135, PT_PC, ucp_Co },
|
||||
{ 138, PT_SC, ucp_Common },
|
||||
{ 145, PT_SC, ucp_Coptic },
|
||||
{ 152, PT_PC, ucp_Cs },
|
||||
{ 155, PT_SC, ucp_Cuneiform },
|
||||
{ 165, PT_SC, ucp_Cypriot },
|
||||
{ 173, PT_SC, ucp_Cyrillic },
|
||||
{ 182, PT_SC, ucp_Deseret },
|
||||
{ 190, PT_SC, ucp_Devanagari },
|
||||
{ 201, PT_SC, ucp_Egyptian_Hieroglyphs },
|
||||
{ 222, PT_SC, ucp_Ethiopic },
|
||||
{ 231, PT_SC, ucp_Georgian },
|
||||
{ 240, PT_SC, ucp_Glagolitic },
|
||||
{ 251, PT_SC, ucp_Gothic },
|
||||
{ 258, PT_SC, ucp_Greek },
|
||||
{ 264, PT_SC, ucp_Gujarati },
|
||||
{ 273, PT_SC, ucp_Gurmukhi },
|
||||
{ 282, PT_SC, ucp_Han },
|
||||
{ 286, PT_SC, ucp_Hangul },
|
||||
{ 293, PT_SC, ucp_Hanunoo },
|
||||
{ 301, PT_SC, ucp_Hebrew },
|
||||
{ 308, PT_SC, ucp_Hiragana },
|
||||
{ 317, PT_SC, ucp_Imperial_Aramaic },
|
||||
{ 334, PT_SC, ucp_Inherited },
|
||||
{ 344, PT_SC, ucp_Inscriptional_Pahlavi },
|
||||
{ 366, PT_SC, ucp_Inscriptional_Parthian },
|
||||
{ 389, PT_SC, ucp_Javanese },
|
||||
{ 398, PT_SC, ucp_Kaithi },
|
||||
{ 405, PT_SC, ucp_Kannada },
|
||||
{ 413, PT_SC, ucp_Katakana },
|
||||
{ 422, PT_SC, ucp_Kayah_Li },
|
||||
{ 431, PT_SC, ucp_Kharoshthi },
|
||||
{ 442, PT_SC, ucp_Khmer },
|
||||
{ 448, PT_GC, ucp_L },
|
||||
{ 450, PT_LAMP, 0 },
|
||||
{ 453, PT_SC, ucp_Lao },
|
||||
{ 457, PT_SC, ucp_Latin },
|
||||
{ 463, PT_SC, ucp_Lepcha },
|
||||
{ 470, PT_SC, ucp_Limbu },
|
||||
{ 476, PT_SC, ucp_Linear_B },
|
||||
{ 485, PT_SC, ucp_Lisu },
|
||||
{ 490, PT_PC, ucp_Ll },
|
||||
{ 493, PT_PC, ucp_Lm },
|
||||
{ 496, PT_PC, ucp_Lo },
|
||||
{ 499, PT_PC, ucp_Lt },
|
||||
{ 502, PT_PC, ucp_Lu },
|
||||
{ 505, PT_SC, ucp_Lycian },
|
||||
{ 512, PT_SC, ucp_Lydian },
|
||||
{ 519, PT_GC, ucp_M },
|
||||
{ 521, PT_SC, ucp_Malayalam },
|
||||
{ 531, PT_PC, ucp_Mc },
|
||||
{ 534, PT_PC, ucp_Me },
|
||||
{ 537, PT_SC, ucp_Meetei_Mayek },
|
||||
{ 550, PT_PC, ucp_Mn },
|
||||
{ 553, PT_SC, ucp_Mongolian },
|
||||
{ 563, PT_SC, ucp_Myanmar },
|
||||
{ 571, PT_GC, ucp_N },
|
||||
{ 573, PT_PC, ucp_Nd },
|
||||
{ 576, PT_SC, ucp_New_Tai_Lue },
|
||||
{ 588, PT_SC, ucp_Nko },
|
||||
{ 592, PT_PC, ucp_Nl },
|
||||
{ 595, PT_PC, ucp_No },
|
||||
{ 598, PT_SC, ucp_Ogham },
|
||||
{ 604, PT_SC, ucp_Ol_Chiki },
|
||||
{ 613, PT_SC, ucp_Old_Italic },
|
||||
{ 624, PT_SC, ucp_Old_Persian },
|
||||
{ 636, PT_SC, ucp_Old_South_Arabian },
|
||||
{ 654, PT_SC, ucp_Old_Turkic },
|
||||
{ 665, PT_SC, ucp_Oriya },
|
||||
{ 671, PT_SC, ucp_Osmanya },
|
||||
{ 679, PT_GC, ucp_P },
|
||||
{ 681, PT_PC, ucp_Pc },
|
||||
{ 684, PT_PC, ucp_Pd },
|
||||
{ 687, PT_PC, ucp_Pe },
|
||||
{ 690, PT_PC, ucp_Pf },
|
||||
{ 693, PT_SC, ucp_Phags_Pa },
|
||||
{ 702, PT_SC, ucp_Phoenician },
|
||||
{ 713, PT_PC, ucp_Pi },
|
||||
{ 716, PT_PC, ucp_Po },
|
||||
{ 719, PT_PC, ucp_Ps },
|
||||
{ 722, PT_SC, ucp_Rejang },
|
||||
{ 729, PT_SC, ucp_Runic },
|
||||
{ 735, PT_GC, ucp_S },
|
||||
{ 737, PT_SC, ucp_Samaritan },
|
||||
{ 747, PT_SC, ucp_Saurashtra },
|
||||
{ 758, PT_PC, ucp_Sc },
|
||||
{ 761, PT_SC, ucp_Shavian },
|
||||
{ 769, PT_SC, ucp_Sinhala },
|
||||
{ 777, PT_PC, ucp_Sk },
|
||||
{ 780, PT_PC, ucp_Sm },
|
||||
{ 783, PT_PC, ucp_So },
|
||||
{ 786, PT_SC, ucp_Sundanese },
|
||||
{ 796, PT_SC, ucp_Syloti_Nagri },
|
||||
{ 809, PT_SC, ucp_Syriac },
|
||||
{ 816, PT_SC, ucp_Tagalog },
|
||||
{ 824, PT_SC, ucp_Tagbanwa },
|
||||
{ 833, PT_SC, ucp_Tai_Le },
|
||||
{ 840, PT_SC, ucp_Tai_Tham },
|
||||
{ 849, PT_SC, ucp_Tai_Viet },
|
||||
{ 858, PT_SC, ucp_Tamil },
|
||||
{ 864, PT_SC, ucp_Telugu },
|
||||
{ 871, PT_SC, ucp_Thaana },
|
||||
{ 878, PT_SC, ucp_Thai },
|
||||
{ 883, PT_SC, ucp_Tibetan },
|
||||
{ 891, PT_SC, ucp_Tifinagh },
|
||||
{ 900, PT_SC, ucp_Ugaritic },
|
||||
{ 909, PT_SC, ucp_Vai },
|
||||
{ 913, PT_SC, ucp_Yi },
|
||||
{ 916, PT_GC, ucp_Z },
|
||||
{ 918, PT_PC, ucp_Zl },
|
||||
{ 921, PT_PC, ucp_Zp },
|
||||
{ 924, PT_PC, ucp_Zs }
|
||||
{ 43, PT_SC, ucp_Batak },
|
||||
{ 49, PT_SC, ucp_Bengali },
|
||||
{ 57, PT_SC, ucp_Bopomofo },
|
||||
{ 66, PT_SC, ucp_Brahmi },
|
||||
{ 73, PT_SC, ucp_Braille },
|
||||
{ 81, PT_SC, ucp_Buginese },
|
||||
{ 90, PT_SC, ucp_Buhid },
|
||||
{ 96, PT_GC, ucp_C },
|
||||
{ 98, PT_SC, ucp_Canadian_Aboriginal },
|
||||
{ 118, PT_SC, ucp_Carian },
|
||||
{ 125, PT_PC, ucp_Cc },
|
||||
{ 128, PT_PC, ucp_Cf },
|
||||
{ 131, PT_SC, ucp_Cham },
|
||||
{ 136, PT_SC, ucp_Cherokee },
|
||||
{ 145, PT_PC, ucp_Cn },
|
||||
{ 148, PT_PC, ucp_Co },
|
||||
{ 151, PT_SC, ucp_Common },
|
||||
{ 158, PT_SC, ucp_Coptic },
|
||||
{ 165, PT_PC, ucp_Cs },
|
||||
{ 168, PT_SC, ucp_Cuneiform },
|
||||
{ 178, PT_SC, ucp_Cypriot },
|
||||
{ 186, PT_SC, ucp_Cyrillic },
|
||||
{ 195, PT_SC, ucp_Deseret },
|
||||
{ 203, PT_SC, ucp_Devanagari },
|
||||
{ 214, PT_SC, ucp_Egyptian_Hieroglyphs },
|
||||
{ 235, PT_SC, ucp_Ethiopic },
|
||||
{ 244, PT_SC, ucp_Georgian },
|
||||
{ 253, PT_SC, ucp_Glagolitic },
|
||||
{ 264, PT_SC, ucp_Gothic },
|
||||
{ 271, PT_SC, ucp_Greek },
|
||||
{ 277, PT_SC, ucp_Gujarati },
|
||||
{ 286, PT_SC, ucp_Gurmukhi },
|
||||
{ 295, PT_SC, ucp_Han },
|
||||
{ 299, PT_SC, ucp_Hangul },
|
||||
{ 306, PT_SC, ucp_Hanunoo },
|
||||
{ 314, PT_SC, ucp_Hebrew },
|
||||
{ 321, PT_SC, ucp_Hiragana },
|
||||
{ 330, PT_SC, ucp_Imperial_Aramaic },
|
||||
{ 347, PT_SC, ucp_Inherited },
|
||||
{ 357, PT_SC, ucp_Inscriptional_Pahlavi },
|
||||
{ 379, PT_SC, ucp_Inscriptional_Parthian },
|
||||
{ 402, PT_SC, ucp_Javanese },
|
||||
{ 411, PT_SC, ucp_Kaithi },
|
||||
{ 418, PT_SC, ucp_Kannada },
|
||||
{ 426, PT_SC, ucp_Katakana },
|
||||
{ 435, PT_SC, ucp_Kayah_Li },
|
||||
{ 444, PT_SC, ucp_Kharoshthi },
|
||||
{ 455, PT_SC, ucp_Khmer },
|
||||
{ 461, PT_GC, ucp_L },
|
||||
{ 463, PT_LAMP, 0 },
|
||||
{ 466, PT_SC, ucp_Lao },
|
||||
{ 470, PT_SC, ucp_Latin },
|
||||
{ 476, PT_SC, ucp_Lepcha },
|
||||
{ 483, PT_SC, ucp_Limbu },
|
||||
{ 489, PT_SC, ucp_Linear_B },
|
||||
{ 498, PT_SC, ucp_Lisu },
|
||||
{ 503, PT_PC, ucp_Ll },
|
||||
{ 506, PT_PC, ucp_Lm },
|
||||
{ 509, PT_PC, ucp_Lo },
|
||||
{ 512, PT_PC, ucp_Lt },
|
||||
{ 515, PT_PC, ucp_Lu },
|
||||
{ 518, PT_SC, ucp_Lycian },
|
||||
{ 525, PT_SC, ucp_Lydian },
|
||||
{ 532, PT_GC, ucp_M },
|
||||
{ 534, PT_SC, ucp_Malayalam },
|
||||
{ 544, PT_SC, ucp_Mandaic },
|
||||
{ 552, PT_PC, ucp_Mc },
|
||||
{ 555, PT_PC, ucp_Me },
|
||||
{ 558, PT_SC, ucp_Meetei_Mayek },
|
||||
{ 571, PT_PC, ucp_Mn },
|
||||
{ 574, PT_SC, ucp_Mongolian },
|
||||
{ 584, PT_SC, ucp_Myanmar },
|
||||
{ 592, PT_GC, ucp_N },
|
||||
{ 594, PT_PC, ucp_Nd },
|
||||
{ 597, PT_SC, ucp_New_Tai_Lue },
|
||||
{ 609, PT_SC, ucp_Nko },
|
||||
{ 613, PT_PC, ucp_Nl },
|
||||
{ 616, PT_PC, ucp_No },
|
||||
{ 619, PT_SC, ucp_Ogham },
|
||||
{ 625, PT_SC, ucp_Ol_Chiki },
|
||||
{ 634, PT_SC, ucp_Old_Italic },
|
||||
{ 645, PT_SC, ucp_Old_Persian },
|
||||
{ 657, PT_SC, ucp_Old_South_Arabian },
|
||||
{ 675, PT_SC, ucp_Old_Turkic },
|
||||
{ 686, PT_SC, ucp_Oriya },
|
||||
{ 692, PT_SC, ucp_Osmanya },
|
||||
{ 700, PT_GC, ucp_P },
|
||||
{ 702, PT_PC, ucp_Pc },
|
||||
{ 705, PT_PC, ucp_Pd },
|
||||
{ 708, PT_PC, ucp_Pe },
|
||||
{ 711, PT_PC, ucp_Pf },
|
||||
{ 714, PT_SC, ucp_Phags_Pa },
|
||||
{ 723, PT_SC, ucp_Phoenician },
|
||||
{ 734, PT_PC, ucp_Pi },
|
||||
{ 737, PT_PC, ucp_Po },
|
||||
{ 740, PT_PC, ucp_Ps },
|
||||
{ 743, PT_SC, ucp_Rejang },
|
||||
{ 750, PT_SC, ucp_Runic },
|
||||
{ 756, PT_GC, ucp_S },
|
||||
{ 758, PT_SC, ucp_Samaritan },
|
||||
{ 768, PT_SC, ucp_Saurashtra },
|
||||
{ 779, PT_PC, ucp_Sc },
|
||||
{ 782, PT_SC, ucp_Shavian },
|
||||
{ 790, PT_SC, ucp_Sinhala },
|
||||
{ 798, PT_PC, ucp_Sk },
|
||||
{ 801, PT_PC, ucp_Sm },
|
||||
{ 804, PT_PC, ucp_So },
|
||||
{ 807, PT_SC, ucp_Sundanese },
|
||||
{ 817, PT_SC, ucp_Syloti_Nagri },
|
||||
{ 830, PT_SC, ucp_Syriac },
|
||||
{ 837, PT_SC, ucp_Tagalog },
|
||||
{ 845, PT_SC, ucp_Tagbanwa },
|
||||
{ 854, PT_SC, ucp_Tai_Le },
|
||||
{ 861, PT_SC, ucp_Tai_Tham },
|
||||
{ 870, PT_SC, ucp_Tai_Viet },
|
||||
{ 879, PT_SC, ucp_Tamil },
|
||||
{ 885, PT_SC, ucp_Telugu },
|
||||
{ 892, PT_SC, ucp_Thaana },
|
||||
{ 899, PT_SC, ucp_Thai },
|
||||
{ 904, PT_SC, ucp_Tibetan },
|
||||
{ 912, PT_SC, ucp_Tifinagh },
|
||||
{ 921, PT_SC, ucp_Ugaritic },
|
||||
{ 930, PT_SC, ucp_Vai },
|
||||
{ 934, PT_ALNUM, 0 },
|
||||
{ 938, PT_PXSPACE, 0 },
|
||||
{ 942, PT_SPACE, 0 },
|
||||
{ 946, PT_WORD, 0 },
|
||||
{ 950, PT_SC, ucp_Yi },
|
||||
{ 953, PT_GC, ucp_Z },
|
||||
{ 955, PT_PC, ucp_Zl },
|
||||
{ 958, PT_PC, ucp_Zp },
|
||||
{ 961, PT_PC, ucp_Zs }
|
||||
};
|
||||
|
||||
const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table);
|
||||
|
@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2009 University of Cambridge
|
||||
Copyright (c) 1997-2010 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -104,6 +104,7 @@ while ((t = *data++) != XCL_END)
|
||||
else /* XCL_PROP & XCL_NOTPROP */
|
||||
{
|
||||
int chartype = UCD_CHARTYPE(c);
|
||||
|
||||
switch(*data)
|
||||
{
|
||||
case PT_ANY:
|
||||
@ -111,12 +112,13 @@ while ((t = *data++) != XCL_END)
|
||||
break;
|
||||
|
||||
case PT_LAMP:
|
||||
if ((chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt) ==
|
||||
(t == XCL_PROP)) return !negated;
|
||||
if ((chartype == ucp_Lu || chartype == ucp_Ll ||
|
||||
chartype == ucp_Lt) == (t == XCL_PROP)) return !negated;
|
||||
break;
|
||||
|
||||
case PT_GC:
|
||||
if ((data[1] == _pcre_ucp_gentype[chartype]) == (t == XCL_PROP)) return !negated;
|
||||
if ((data[1] == _pcre_ucp_gentype[chartype]) == (t == XCL_PROP))
|
||||
return !negated;
|
||||
break;
|
||||
|
||||
case PT_PC:
|
||||
@ -127,6 +129,33 @@ while ((t = *data++) != XCL_END)
|
||||
if ((data[1] == UCD_SCRIPT(c)) == (t == XCL_PROP)) return !negated;
|
||||
break;
|
||||
|
||||
case PT_ALNUM:
|
||||
if ((_pcre_ucp_gentype[chartype] == ucp_L ||
|
||||
_pcre_ucp_gentype[chartype] == ucp_N) == (t == XCL_PROP))
|
||||
return !negated;
|
||||
break;
|
||||
|
||||
case PT_SPACE: /* Perl space */
|
||||
if ((_pcre_ucp_gentype[chartype] == ucp_Z ||
|
||||
c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
|
||||
== (t == XCL_PROP))
|
||||
return !negated;
|
||||
break;
|
||||
|
||||
case PT_PXSPACE: /* POSIX space */
|
||||
if ((_pcre_ucp_gentype[chartype] == ucp_Z ||
|
||||
c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
|
||||
c == CHAR_FF || c == CHAR_CR) == (t == XCL_PROP))
|
||||
return !negated;
|
||||
break;
|
||||
|
||||
case PT_WORD:
|
||||
if ((_pcre_ucp_gentype[chartype] == ucp_L ||
|
||||
_pcre_ucp_gentype[chartype] == ucp_N || c == CHAR_UNDERSCORE)
|
||||
== (t == XCL_PROP))
|
||||
return !negated;
|
||||
break;
|
||||
|
||||
/* This should never occur, but compilers may mutter if there is no
|
||||
default. */
|
||||
|
||||
|
@ -150,7 +150,10 @@ enum {
|
||||
ucp_Old_Turkic = G_UNICODE_SCRIPT_OLD_TURKIC,
|
||||
ucp_Samaritan = G_UNICODE_SCRIPT_SAMARITAN,
|
||||
ucp_Tai_Tham = G_UNICODE_SCRIPT_TAI_THAM,
|
||||
ucp_Tai_Viet = G_UNICODE_SCRIPT_TAI_VIET
|
||||
ucp_Tai_Viet = G_UNICODE_SCRIPT_TAI_VIET,
|
||||
ucp_Batak = G_UNICODE_SCRIPT_BATAK,
|
||||
ucp_Brahmi = G_UNICODE_SCRIPT_BRAHMI,
|
||||
ucp_Mandaic = G_UNICODE_SCRIPT_MANDAIC
|
||||
};
|
||||
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user