mirror of
https://gitlab.gnome.org/GNOME/glib.git
synced 2024-11-10 19:36:18 +01:00
Update the include pcre to 8.02
This commit is contained in:
parent
b0b7aeffc0
commit
85621f1a0f
@ -53,7 +53,6 @@ libpcre_la_SOURCES = \
|
||||
pcre.h \
|
||||
pcre_internal.h \
|
||||
ucp.h \
|
||||
ucpinternal.h \
|
||||
$(libpcre_headers)
|
||||
|
||||
libpcre_la_LIBADD = $(DEP_LIBS)
|
||||
|
@ -5,7 +5,7 @@
|
||||
/* This is the public header file for the PCRE library, to be #included by
|
||||
applications that call the PCRE functions.
|
||||
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
Copyright (c) 1997-2009 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -41,10 +41,10 @@ POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
/* The current PCRE version information. */
|
||||
|
||||
#define PCRE_MAJOR 7
|
||||
#define PCRE_MINOR 8
|
||||
#define PCRE_MAJOR 8
|
||||
#define PCRE_MINOR 02
|
||||
#define PCRE_PRERELEASE
|
||||
#define PCRE_DATE 2008-09-05
|
||||
#define PCRE_DATE 2010-03-19
|
||||
|
||||
/* When an application links to a PCRE DLL in Windows, the symbols that are
|
||||
imported have to be identified as such. When building PCRE, the appropriate
|
||||
@ -95,7 +95,8 @@ it is needed here for malloc. */
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Options */
|
||||
/* Options. Some are compile-time only, some are run-time only, and some are
|
||||
both, so we keep them all distinct. */
|
||||
|
||||
#define PCRE_CASELESS 0x00000001
|
||||
#define PCRE_MULTILINE 0x00000002
|
||||
@ -112,7 +113,8 @@ extern "C" {
|
||||
#define PCRE_NO_AUTO_CAPTURE 0x00001000
|
||||
#define PCRE_NO_UTF8_CHECK 0x00002000
|
||||
#define PCRE_AUTO_CALLOUT 0x00004000
|
||||
#define PCRE_PARTIAL 0x00008000
|
||||
#define PCRE_PARTIAL_SOFT 0x00008000
|
||||
#define PCRE_PARTIAL 0x00008000 /* Backwards compatible synonym */
|
||||
#define PCRE_DFA_SHORTEST 0x00010000
|
||||
#define PCRE_DFA_RESTART 0x00020000
|
||||
#define PCRE_FIRSTLINE 0x00040000
|
||||
@ -125,6 +127,10 @@ extern "C" {
|
||||
#define PCRE_BSR_ANYCRLF 0x00800000
|
||||
#define PCRE_BSR_UNICODE 0x01000000
|
||||
#define PCRE_JAVASCRIPT_COMPAT 0x02000000
|
||||
#define PCRE_NO_START_OPTIMIZE 0x04000000
|
||||
#define PCRE_NO_START_OPTIMISE 0x04000000
|
||||
#define PCRE_PARTIAL_HARD 0x08000000
|
||||
#define PCRE_NOTEMPTY_ATSTART 0x10000000
|
||||
|
||||
/* Exec-time and get/set-time error codes */
|
||||
|
||||
@ -171,6 +177,7 @@ extern "C" {
|
||||
#define PCRE_INFO_OKPARTIAL 12
|
||||
#define PCRE_INFO_JCHANGED 13
|
||||
#define PCRE_INFO_HASCRORLF 14
|
||||
#define PCRE_INFO_MINLENGTH 15
|
||||
|
||||
/* Request types for pcre_config(). Do not re-arrange, in order to remain
|
||||
compatible. */
|
||||
@ -250,7 +257,7 @@ typedef struct pcre_callout_block {
|
||||
#define pcre_free g_free
|
||||
#define pcre_stack_malloc g_try_malloc
|
||||
|
||||
PCRE_EXP_DECL int (*pcre_callout)(pcre_callout_block *);
|
||||
int (*pcre_callout)(pcre_callout_block *);
|
||||
|
||||
/* Exported PCRE functions */
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
Copyright (c) 1997-2009 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -104,11 +104,11 @@ switch (what)
|
||||
break;
|
||||
|
||||
case PCRE_CONFIG_MATCH_LIMIT:
|
||||
*((unsigned int *)where) = MATCH_LIMIT;
|
||||
*((unsigned long int *)where) = MATCH_LIMIT;
|
||||
break;
|
||||
|
||||
case PCRE_CONFIG_MATCH_LIMIT_RECURSION:
|
||||
*((unsigned int *)where) = MATCH_LIMIT_RECURSION;
|
||||
*((unsigned long int *)where) = MATCH_LIMIT_RECURSION;
|
||||
break;
|
||||
|
||||
case PCRE_CONFIG_STACKRECURSE:
|
||||
|
@ -3,10 +3,11 @@
|
||||
*************************************************/
|
||||
|
||||
/* PCRE is a library of functions to support regular expressions whose syntax
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
and semantics are as close as possible to those of the Perl 5 language (but see
|
||||
below for why this module is different).
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
Copyright (c) 1997-2010 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -44,6 +45,34 @@ FSM). This is NOT Perl- compatible, but it has advantages in certain
|
||||
applications. */
|
||||
|
||||
|
||||
/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
|
||||
the performance of his patterns greatly. I could not use it as it stood, as it
|
||||
was not thread safe, and made assumptions about pattern sizes. Also, it caused
|
||||
test 7 to loop, and test 9 to crash with a segfault.
|
||||
|
||||
The issue is the check for duplicate states, which is done by a simple linear
|
||||
search up the state list. (Grep for "duplicate" below to find the code.) For
|
||||
many patterns, there will never be many states active at one time, so a simple
|
||||
linear search is fine. In patterns that have many active states, it might be a
|
||||
bottleneck. The suggested code used an indexing scheme to remember which states
|
||||
had previously been used for each character, and avoided the linear search when
|
||||
it knew there was no chance of a duplicate. This was implemented when adding
|
||||
states to the state lists.
|
||||
|
||||
I wrote some thread-safe, not-limited code to try something similar at the time
|
||||
of checking for duplicates (instead of when adding states), using index vectors
|
||||
on the stack. It did give a 13% improvement with one specially constructed
|
||||
pattern for certain subject strings, but on other strings and on many of the
|
||||
simpler patterns in the test suite it did worse. The major problem, I think,
|
||||
was the extra time to initialize the index. This had to be done for each call
|
||||
of internal_dfa_exec(). (The supplied patch used a static vector, initialized
|
||||
only once - I suspect this was the cause of the problems with the tests.)
|
||||
|
||||
Overall, I concluded that the gains in some cases did not outweigh the losses
|
||||
in others, so I abandoned this code. */
|
||||
|
||||
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
@ -60,7 +89,6 @@ applications. */
|
||||
#define SP " "
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Code parameters and static tables *
|
||||
*************************************************/
|
||||
@ -81,16 +109,18 @@ never stored, so we push them well clear of the normal opcodes. */
|
||||
character that is to be tested in some way. This makes it possible to
|
||||
centralize the loading of these characters. In the case of Type * etc, the
|
||||
"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
|
||||
small value. ***NOTE*** If the start of this table is modified, the two tables
|
||||
that follow must also be modified. */
|
||||
small value. Non-zero values in the table are the offsets from the opcode where
|
||||
the character is to be found. ***NOTE*** If the start of this table is
|
||||
modified, the three tables that follow must also be modified. */
|
||||
|
||||
static const uschar coptable[] = {
|
||||
0, /* End */
|
||||
0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
|
||||
0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
|
||||
0, 0, 0, /* Any, AllAny, Anybyte */
|
||||
0, 0, 0, /* NOTPROP, PROP, EXTUNI */
|
||||
0, 0, /* \P, \p */
|
||||
0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
|
||||
0, /* \X */
|
||||
0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
|
||||
1, /* Char */
|
||||
1, /* Charnc */
|
||||
@ -127,12 +157,69 @@ static const uschar coptable[] = {
|
||||
0, /* Reverse */
|
||||
0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
|
||||
0, 0, 0, /* SBRA, SCBRA, SCOND */
|
||||
0, /* CREF */
|
||||
0, /* RREF */
|
||||
0, 0, /* CREF, NCREF */
|
||||
0, 0, /* RREF, NRREF */
|
||||
0, /* DEF */
|
||||
0, 0, /* BRAZERO, BRAMINZERO */
|
||||
0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
|
||||
0, 0, 0 /* FAIL, ACCEPT, SKIPZERO */
|
||||
0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
|
||||
};
|
||||
|
||||
/* This table identifies those opcodes that inspect a character. It is used to
|
||||
remember the fact that a character could have been inspected when the end of
|
||||
the subject is reached. ***NOTE*** If the start of this table is modified, the
|
||||
two tables that follow must also be modified. */
|
||||
|
||||
static const uschar poptable[] = {
|
||||
0, /* End */
|
||||
0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
|
||||
1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
|
||||
1, 1, 1, /* Any, AllAny, Anybyte */
|
||||
1, 1, /* \P, \p */
|
||||
1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
|
||||
1, /* \X */
|
||||
0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
|
||||
1, /* Char */
|
||||
1, /* Charnc */
|
||||
1, /* not */
|
||||
/* Positive single-char repeats */
|
||||
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
|
||||
1, 1, 1, /* upto, minupto, exact */
|
||||
1, 1, 1, 1, /* *+, ++, ?+, upto+ */
|
||||
/* Negative single-char repeats - only for chars < 256 */
|
||||
1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
|
||||
1, 1, 1, /* NOT upto, minupto, exact */
|
||||
1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
|
||||
/* Positive type repeats */
|
||||
1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
|
||||
1, 1, 1, /* Type upto, minupto, exact */
|
||||
1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
|
||||
/* Character class & ref repeats */
|
||||
1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
|
||||
1, 1, /* CRRANGE, CRMINRANGE */
|
||||
1, /* CLASS */
|
||||
1, /* NCLASS */
|
||||
1, /* XCLASS - variable length */
|
||||
0, /* REF */
|
||||
0, /* RECURSE */
|
||||
0, /* CALLOUT */
|
||||
0, /* Alt */
|
||||
0, /* Ket */
|
||||
0, /* KetRmax */
|
||||
0, /* KetRmin */
|
||||
0, /* Assert */
|
||||
0, /* Assert not */
|
||||
0, /* Assert behind */
|
||||
0, /* Assert behind not */
|
||||
0, /* Reverse */
|
||||
0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
|
||||
0, 0, 0, /* SBRA, SCBRA, SCOND */
|
||||
0, 0, /* CREF, NCREF */
|
||||
0, 0, /* RREF, NRREF */
|
||||
0, /* DEF */
|
||||
0, 0, /* BRAZERO, BRAMINZERO */
|
||||
0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
|
||||
0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
|
||||
};
|
||||
|
||||
/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
|
||||
@ -170,7 +257,7 @@ typedef struct stateblock {
|
||||
#define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
|
||||
|
||||
|
||||
#ifdef DEBUG
|
||||
#ifdef PCRE_DEBUG
|
||||
/*************************************************
|
||||
* Print character string *
|
||||
*************************************************/
|
||||
@ -390,6 +477,11 @@ if (*first_op == OP_REVERSE)
|
||||
current_subject -= gone_back;
|
||||
}
|
||||
|
||||
/* Save the earliest consulted character */
|
||||
|
||||
if (current_subject < md->start_used_ptr)
|
||||
md->start_used_ptr = current_subject;
|
||||
|
||||
/* Now we can process the individual branches. */
|
||||
|
||||
end_code = this_start_code;
|
||||
@ -454,6 +546,8 @@ for (;;)
|
||||
int i, j;
|
||||
int clen, dlen;
|
||||
unsigned int c, d;
|
||||
int forced_fail = 0;
|
||||
BOOL could_continue = FALSE;
|
||||
|
||||
/* Make the new state list into the active state list and empty the
|
||||
new state list. */
|
||||
@ -467,7 +561,7 @@ for (;;)
|
||||
workspace[0] ^= 1; /* Remember for the restarting feature */
|
||||
workspace[1] = active_count;
|
||||
|
||||
#ifdef DEBUG
|
||||
#ifdef PCRE_DEBUG
|
||||
printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
|
||||
pchars((uschar *)ptr, strlen((char *)ptr), stdout);
|
||||
printf("\"\n");
|
||||
@ -511,9 +605,9 @@ for (;;)
|
||||
stateblock *current_state = active_states + i;
|
||||
const uschar *code;
|
||||
int state_offset = current_state->offset;
|
||||
int count, codevalue;
|
||||
int count, codevalue, rrc;
|
||||
|
||||
#ifdef DEBUG
|
||||
#ifdef PCRE_DEBUG
|
||||
printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
|
||||
if (clen == 0) printf("EOL\n");
|
||||
else if (c > 32 && c < 127) printf("'%c'\n", c);
|
||||
@ -543,7 +637,9 @@ for (;;)
|
||||
}
|
||||
}
|
||||
|
||||
/* Check for a duplicate state with the same count, and skip if found. */
|
||||
/* Check for a duplicate state with the same count, and skip if found.
|
||||
See the note at the head of this module about the possibility of improving
|
||||
performance here. */
|
||||
|
||||
for (j = 0; j < i; j++)
|
||||
{
|
||||
@ -560,6 +656,12 @@ for (;;)
|
||||
code = start_code + state_offset;
|
||||
codevalue = *code;
|
||||
|
||||
/* If this opcode inspects a character, but we are at the end of the
|
||||
subject, remember the fact for use when testing for a partial match. */
|
||||
|
||||
if (clen == 0 && poptable[codevalue] != 0)
|
||||
could_continue = TRUE;
|
||||
|
||||
/* If this opcode is followed by an inline character, load it. It is
|
||||
tempting to test for the presence of a subject character here, but that
|
||||
is wrong, because sometimes zero repetitions of the subject are
|
||||
@ -606,11 +708,24 @@ for (;;)
|
||||
|
||||
switch (codevalue)
|
||||
{
|
||||
/* ========================================================================== */
|
||||
/* These cases are never obeyed. This is a fudge that causes a compile-
|
||||
time error if the vectors coptable or poptable, which are indexed by
|
||||
opcode, are not the correct length. It seems to be the only way to do
|
||||
such a check at compile time, as the sizeof() operator does not work
|
||||
in the C preprocessor. */
|
||||
|
||||
case OP_TABLE_LENGTH:
|
||||
case OP_TABLE_LENGTH +
|
||||
((sizeof(coptable) == OP_TABLE_LENGTH) &&
|
||||
(sizeof(poptable) == OP_TABLE_LENGTH)):
|
||||
break;
|
||||
|
||||
/* ========================================================================== */
|
||||
/* Reached a closing bracket. If not at the end of the pattern, carry
|
||||
on with the next opcode. Otherwise, unless we have an empty string and
|
||||
PCRE_NOTEMPTY is set, save the match data, shifting up all previous
|
||||
PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
|
||||
start of the subject, save the match data, shifting up all previous
|
||||
matches so we always have the longest first. */
|
||||
|
||||
case OP_KET:
|
||||
@ -624,26 +739,32 @@ for (;;)
|
||||
ADD_ACTIVE(state_offset - GET(code, 1), 0);
|
||||
}
|
||||
}
|
||||
else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
|
||||
else
|
||||
{
|
||||
if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
|
||||
else if (match_count > 0 && ++match_count * 2 >= offsetcount)
|
||||
match_count = 0;
|
||||
count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
|
||||
if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
|
||||
if (offsetcount >= 2)
|
||||
if (ptr > current_subject ||
|
||||
((md->moptions & PCRE_NOTEMPTY) == 0 &&
|
||||
((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
|
||||
current_subject > start_subject + md->start_offset)))
|
||||
{
|
||||
offsets[0] = current_subject - start_subject;
|
||||
offsets[1] = ptr - start_subject;
|
||||
DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
|
||||
offsets[1] - offsets[0], current_subject));
|
||||
}
|
||||
if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
|
||||
{
|
||||
DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
|
||||
"%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
|
||||
match_count, rlevel*2-2, SP));
|
||||
return match_count;
|
||||
if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
|
||||
else if (match_count > 0 && ++match_count * 2 >= offsetcount)
|
||||
match_count = 0;
|
||||
count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
|
||||
if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
|
||||
if (offsetcount >= 2)
|
||||
{
|
||||
offsets[0] = current_subject - start_subject;
|
||||
offsets[1] = ptr - start_subject;
|
||||
DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
|
||||
offsets[1] - offsets[0], current_subject));
|
||||
}
|
||||
if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
|
||||
{
|
||||
DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
|
||||
"%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
|
||||
match_count, rlevel*2-2, SP));
|
||||
return match_count;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
@ -757,7 +878,7 @@ for (;;)
|
||||
if ((md->moptions & PCRE_NOTEOL) == 0)
|
||||
{
|
||||
if (clen == 0 ||
|
||||
(IS_NEWLINE(ptr) &&
|
||||
((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
|
||||
((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
|
||||
))
|
||||
{ ADD_ACTIVE(state_offset + 1, 0); }
|
||||
@ -794,6 +915,7 @@ for (;;)
|
||||
if (ptr > start_subject)
|
||||
{
|
||||
const uschar *temp = ptr - 1;
|
||||
if (temp < md->start_used_ptr) md->start_used_ptr = temp;
|
||||
#ifdef SUPPORT_UTF8
|
||||
if (utf8) BACKCHAR(temp);
|
||||
#endif
|
||||
@ -802,8 +924,9 @@ for (;;)
|
||||
}
|
||||
else left_word = 0;
|
||||
|
||||
if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
|
||||
else right_word = 0;
|
||||
if (clen > 0)
|
||||
right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
|
||||
else right_word = 0;
|
||||
|
||||
if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
|
||||
{ ADD_ACTIVE(state_offset + 1, 0); }
|
||||
@ -2157,11 +2280,12 @@ for (;;)
|
||||
|
||||
/* ========================================================================== */
|
||||
/* These are the opcodes for fancy brackets of various kinds. We have
|
||||
to use recursion in order to handle them. The "always failing" assersion
|
||||
(?!) is optimised when compiling to OP_FAIL, so we have to support that,
|
||||
to use recursion in order to handle them. The "always failing" assertion
|
||||
(?!) is optimised to OP_FAIL when compiling, so we have to support that,
|
||||
though the other "backtracking verbs" are not supported. */
|
||||
|
||||
case OP_FAIL:
|
||||
forced_fail++; /* Count FAILs for multiple states */
|
||||
break;
|
||||
|
||||
case OP_ASSERT:
|
||||
@ -2189,6 +2313,7 @@ for (;;)
|
||||
rlevel, /* function recursion level */
|
||||
recursing); /* pass on regex recursion */
|
||||
|
||||
if (rc == PCRE_ERROR_DFA_UITEM) return rc;
|
||||
if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
|
||||
{ ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
|
||||
}
|
||||
@ -2200,29 +2325,60 @@ for (;;)
|
||||
{
|
||||
int local_offsets[1000];
|
||||
int local_workspace[1000];
|
||||
int condcode = code[LINK_SIZE+1];
|
||||
int codelink = GET(code, 1);
|
||||
int condcode;
|
||||
|
||||
/* Because of the way auto-callout works during compile, a callout item
|
||||
is inserted between OP_COND and an assertion condition. This does not
|
||||
happen for the other conditions. */
|
||||
|
||||
if (code[LINK_SIZE+1] == OP_CALLOUT)
|
||||
{
|
||||
rrc = 0;
|
||||
if (pcre_callout != NULL)
|
||||
{
|
||||
pcre_callout_block cb;
|
||||
cb.version = 1; /* Version 1 of the callout block */
|
||||
cb.callout_number = code[LINK_SIZE+2];
|
||||
cb.offset_vector = offsets;
|
||||
cb.subject = (PCRE_SPTR)start_subject;
|
||||
cb.subject_length = end_subject - start_subject;
|
||||
cb.start_match = current_subject - start_subject;
|
||||
cb.current_position = ptr - start_subject;
|
||||
cb.pattern_position = GET(code, LINK_SIZE + 3);
|
||||
cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
|
||||
cb.capture_top = 1;
|
||||
cb.capture_last = -1;
|
||||
cb.callout_data = md->callout_data;
|
||||
if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
|
||||
}
|
||||
if (rrc > 0) break; /* Fail this thread */
|
||||
code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
|
||||
}
|
||||
|
||||
condcode = code[LINK_SIZE+1];
|
||||
|
||||
/* Back reference conditions are not supported */
|
||||
|
||||
if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
|
||||
if (condcode == OP_CREF || condcode == OP_NCREF)
|
||||
return PCRE_ERROR_DFA_UCOND;
|
||||
|
||||
/* The DEFINE condition is always false */
|
||||
|
||||
if (condcode == OP_DEF)
|
||||
{
|
||||
ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
|
||||
}
|
||||
{ ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
|
||||
|
||||
/* The only supported version of OP_RREF is for the value RREF_ANY,
|
||||
which means "test if in any recursion". We can't test for specifically
|
||||
recursed groups. */
|
||||
|
||||
else if (condcode == OP_RREF)
|
||||
else if (condcode == OP_RREF || condcode == OP_NRREF)
|
||||
{
|
||||
int value = GET2(code, LINK_SIZE+2);
|
||||
if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
|
||||
if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
|
||||
else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
|
||||
if (recursing > 0)
|
||||
{ ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
|
||||
else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
|
||||
}
|
||||
|
||||
/* Otherwise, the condition is an assertion */
|
||||
@ -2248,11 +2404,12 @@ for (;;)
|
||||
rlevel, /* function recursion level */
|
||||
recursing); /* pass on regex recursion */
|
||||
|
||||
if (rc == PCRE_ERROR_DFA_UITEM) return rc;
|
||||
if ((rc >= 0) ==
|
||||
(condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
|
||||
{ ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
|
||||
else
|
||||
{ ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
|
||||
{ ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
|
||||
}
|
||||
}
|
||||
break;
|
||||
@ -2404,9 +2561,9 @@ for (;;)
|
||||
/* Handle callouts */
|
||||
|
||||
case OP_CALLOUT:
|
||||
rrc = 0;
|
||||
if (pcre_callout != NULL)
|
||||
{
|
||||
int rrc;
|
||||
pcre_callout_block cb;
|
||||
cb.version = 1; /* Version 1 of the callout block */
|
||||
cb.callout_number = code[1];
|
||||
@ -2421,8 +2578,9 @@ for (;;)
|
||||
cb.capture_last = -1;
|
||||
cb.callout_data = md->callout_data;
|
||||
if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
|
||||
if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
|
||||
}
|
||||
if (rrc == 0)
|
||||
{ ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
|
||||
break;
|
||||
|
||||
|
||||
@ -2438,19 +2596,33 @@ for (;;)
|
||||
/* We have finished the processing at the current subject character. If no
|
||||
new states have been set for the next character, we have found all the
|
||||
matches that we are going to find. If we are at the top level and partial
|
||||
matching has been requested, check for appropriate conditions. */
|
||||
matching has been requested, check for appropriate conditions.
|
||||
|
||||
The "forced_fail" variable counts the number of (*F) encountered for the
|
||||
character. If it is equal to the original active_count (saved in
|
||||
workspace[1]) it means that (*F) was found on every active state. In this
|
||||
case we don't want to give a partial match.
|
||||
|
||||
The "could_continue" variable is true if a state could have continued but
|
||||
for the fact that the end of the subject was reached. */
|
||||
|
||||
if (new_count <= 0)
|
||||
{
|
||||
if (match_count < 0 && /* No matches found */
|
||||
rlevel == 1 && /* Top level match function */
|
||||
(md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
|
||||
ptr >= end_subject && /* Reached end of subject */
|
||||
ptr > current_subject) /* Matched non-empty string */
|
||||
if (rlevel == 1 && /* Top level, and */
|
||||
could_continue && /* Some could go on */
|
||||
forced_fail != workspace[1] && /* Not all forced fail & */
|
||||
( /* either... */
|
||||
(md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
|
||||
|| /* or... */
|
||||
((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
|
||||
match_count < 0) /* no matches */
|
||||
) && /* And... */
|
||||
ptr >= end_subject && /* Reached end of subject */
|
||||
ptr > current_subject) /* Matched non-empty string */
|
||||
{
|
||||
if (offsetcount >= 2)
|
||||
{
|
||||
offsets[0] = current_subject - start_subject;
|
||||
offsets[0] = md->start_used_ptr - start_subject;
|
||||
offsets[1] = end_subject - start_subject;
|
||||
}
|
||||
match_count = PCRE_ERROR_PARTIAL;
|
||||
@ -2592,6 +2764,7 @@ md->start_code = (const uschar *)argument_re +
|
||||
re->name_table_offset + re->name_count * re->name_entry_size;
|
||||
md->start_subject = (const unsigned char *)subject;
|
||||
md->end_subject = end_subject;
|
||||
md->start_offset = start_offset;
|
||||
md->moptions = options;
|
||||
md->poptions = re->options;
|
||||
|
||||
@ -2614,10 +2787,10 @@ switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)option
|
||||
PCRE_NEWLINE_BITS)
|
||||
{
|
||||
case 0: newline = NEWLINE; break; /* Compile-time default */
|
||||
case PCRE_NEWLINE_CR: newline = '\r'; break;
|
||||
case PCRE_NEWLINE_LF: newline = '\n'; break;
|
||||
case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
|
||||
case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
|
||||
case PCRE_NEWLINE_CR+
|
||||
PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
|
||||
PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
|
||||
case PCRE_NEWLINE_ANY: newline = -1; break;
|
||||
case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
|
||||
default: return PCRE_ERROR_BADNEWLINE;
|
||||
@ -2696,8 +2869,8 @@ if (!anchored)
|
||||
}
|
||||
else
|
||||
{
|
||||
if (startline && study != NULL &&
|
||||
(study->options & PCRE_STUDY_MAPPED) != 0)
|
||||
if (!startline && study != NULL &&
|
||||
(study->flags & PCRE_STUDY_MAPPED) != 0)
|
||||
start_bits = study->start_bits;
|
||||
}
|
||||
}
|
||||
@ -2713,9 +2886,8 @@ if ((re->flags & PCRE_REQCHSET) != 0)
|
||||
}
|
||||
|
||||
/* Call the main matching function, looping for a non-anchored regex after a
|
||||
failed match. Unless restarting, optimize by moving to the first match
|
||||
character if possible, when not anchored. Then unless wanting a partial match,
|
||||
check for a required later character. */
|
||||
failed match. If not restarting, perform certain optimizations at the start of
|
||||
a match. */
|
||||
|
||||
for (;;)
|
||||
{
|
||||
@ -2725,11 +2897,10 @@ for (;;)
|
||||
{
|
||||
const uschar *save_end_subject = end_subject;
|
||||
|
||||
/* Advance to a unique first char if possible. If firstline is TRUE, the
|
||||
start of the match is constrained to the first line of a multiline string.
|
||||
Implement this by temporarily adjusting end_subject so that we stop
|
||||
scanning at a newline. If the match fails at the newline, later code breaks
|
||||
this loop. */
|
||||
/* If firstline is TRUE, the start of the match is constrained to the first
|
||||
line of a multiline string. Implement this by temporarily adjusting
|
||||
end_subject so that we stop scanning at a newline. If the match fails at
|
||||
the newline, later code breaks this loop. */
|
||||
|
||||
if (firstline)
|
||||
{
|
||||
@ -2749,126 +2920,151 @@ for (;;)
|
||||
end_subject = t;
|
||||
}
|
||||
|
||||
if (first_byte >= 0)
|
||||
{
|
||||
if (first_byte_caseless)
|
||||
while (current_subject < end_subject &&
|
||||
lcc[*current_subject] != first_byte)
|
||||
current_subject++;
|
||||
else
|
||||
while (current_subject < end_subject && *current_subject != first_byte)
|
||||
current_subject++;
|
||||
}
|
||||
/* There are some optimizations that avoid running the match if a known
|
||||
starting point is not found. However, there is an option that disables
|
||||
these, for testing and for ensuring that all callouts do actually occur. */
|
||||
|
||||
/* Or to just after a linebreak for a multiline match if possible */
|
||||
|
||||
else if (startline)
|
||||
if ((options & PCRE_NO_START_OPTIMIZE) == 0)
|
||||
{
|
||||
if (current_subject > md->start_subject + start_offset)
|
||||
/* Advance to a known first byte. */
|
||||
|
||||
if (first_byte >= 0)
|
||||
{
|
||||
#ifdef SUPPORT_UTF8
|
||||
if (utf8)
|
||||
{
|
||||
while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
|
||||
{
|
||||
if (first_byte_caseless)
|
||||
while (current_subject < end_subject &&
|
||||
lcc[*current_subject] != first_byte)
|
||||
current_subject++;
|
||||
while(current_subject < end_subject &&
|
||||
(*current_subject & 0xc0) == 0x80)
|
||||
current_subject++;
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
|
||||
current_subject++;
|
||||
|
||||
/* If we have just passed a CR and the newline option is ANY or
|
||||
ANYCRLF, and we are now at a LF, advance the match position by one more
|
||||
character. */
|
||||
|
||||
if (current_subject[-1] == '\r' &&
|
||||
(md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
|
||||
current_subject < end_subject &&
|
||||
*current_subject == '\n')
|
||||
current_subject++;
|
||||
while (current_subject < end_subject &&
|
||||
*current_subject != first_byte)
|
||||
current_subject++;
|
||||
}
|
||||
}
|
||||
|
||||
/* Or to a non-unique first char after study */
|
||||
/* Or to just after a linebreak for a multiline match if possible */
|
||||
|
||||
else if (start_bits != NULL)
|
||||
{
|
||||
while (current_subject < end_subject)
|
||||
else if (startline)
|
||||
{
|
||||
register unsigned int c = *current_subject;
|
||||
if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
|
||||
else break;
|
||||
if (current_subject > md->start_subject + start_offset)
|
||||
{
|
||||
#ifdef SUPPORT_UTF8
|
||||
if (utf8)
|
||||
{
|
||||
while (current_subject < end_subject &&
|
||||
!WAS_NEWLINE(current_subject))
|
||||
{
|
||||
current_subject++;
|
||||
while(current_subject < end_subject &&
|
||||
(*current_subject & 0xc0) == 0x80)
|
||||
current_subject++;
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
|
||||
current_subject++;
|
||||
|
||||
/* If we have just passed a CR and the newline option is ANY or
|
||||
ANYCRLF, and we are now at a LF, advance the match position by one
|
||||
more character. */
|
||||
|
||||
if (current_subject[-1] == CHAR_CR &&
|
||||
(md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
|
||||
current_subject < end_subject &&
|
||||
*current_subject == CHAR_NL)
|
||||
current_subject++;
|
||||
}
|
||||
}
|
||||
|
||||
/* Or to a non-unique first char after study */
|
||||
|
||||
else if (start_bits != NULL)
|
||||
{
|
||||
while (current_subject < end_subject)
|
||||
{
|
||||
register unsigned int c = *current_subject;
|
||||
if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
|
||||
else break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Restore fudged end_subject */
|
||||
|
||||
end_subject = save_end_subject;
|
||||
}
|
||||
|
||||
/* If req_byte is set, we know that that character must appear in the subject
|
||||
for the match to succeed. If the first character is set, req_byte must be
|
||||
later in the subject; otherwise the test starts at the match point. This
|
||||
optimization can save a huge amount of work in patterns with nested unlimited
|
||||
repeats that aren't going to match. Writing separate code for cased/caseless
|
||||
versions makes it go faster, as does using an autoincrement and backing off
|
||||
on a match.
|
||||
/* The following two optimizations are disabled for partial matching or if
|
||||
disabling is explicitly requested (and of course, by the test above, this
|
||||
code is not obeyed when restarting after a partial match). */
|
||||
|
||||
HOWEVER: when the subject string is very, very long, searching to its end can
|
||||
take a long time, and give bad performance on quite ordinary patterns. This
|
||||
showed up when somebody was matching /^C/ on a 32-megabyte string... so we
|
||||
don't do this when the string is sufficiently long.
|
||||
|
||||
ALSO: this processing is disabled when partial matching is requested.
|
||||
*/
|
||||
|
||||
if (req_byte >= 0 &&
|
||||
end_subject - current_subject < REQ_BYTE_MAX &&
|
||||
(options & PCRE_PARTIAL) == 0)
|
||||
{
|
||||
register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
|
||||
|
||||
/* We don't need to repeat the search if we haven't yet reached the
|
||||
place we found it at last time. */
|
||||
|
||||
if (p > req_byte_ptr)
|
||||
if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
|
||||
(options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
|
||||
{
|
||||
if (req_byte_caseless)
|
||||
/* If the pattern was studied, a minimum subject length may be set. This
|
||||
is a lower bound; no actual string of that length may actually match the
|
||||
pattern. Although the value is, strictly, in characters, we treat it as
|
||||
bytes to avoid spending too much time in this optimization. */
|
||||
|
||||
if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
|
||||
(pcre_uint32)(end_subject - current_subject) < study->minlength)
|
||||
return PCRE_ERROR_NOMATCH;
|
||||
|
||||
/* If req_byte is set, we know that that character must appear in the
|
||||
subject for the match to succeed. If the first character is set, req_byte
|
||||
must be later in the subject; otherwise the test starts at the match
|
||||
point. This optimization can save a huge amount of work in patterns with
|
||||
nested unlimited repeats that aren't going to match. Writing separate
|
||||
code for cased/caseless versions makes it go faster, as does using an
|
||||
autoincrement and backing off on a match.
|
||||
|
||||
HOWEVER: when the subject string is very, very long, searching to its end
|
||||
can take a long time, and give bad performance on quite ordinary
|
||||
patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
|
||||
string... so we don't do this when the string is sufficiently long. */
|
||||
|
||||
if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
|
||||
{
|
||||
while (p < end_subject)
|
||||
register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
|
||||
|
||||
/* We don't need to repeat the search if we haven't yet reached the
|
||||
place we found it at last time. */
|
||||
|
||||
if (p > req_byte_ptr)
|
||||
{
|
||||
register int pp = *p++;
|
||||
if (pp == req_byte || pp == req_byte2) { p--; break; }
|
||||
if (req_byte_caseless)
|
||||
{
|
||||
while (p < end_subject)
|
||||
{
|
||||
register int pp = *p++;
|
||||
if (pp == req_byte || pp == req_byte2) { p--; break; }
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
while (p < end_subject)
|
||||
{
|
||||
if (*p++ == req_byte) { p--; break; }
|
||||
}
|
||||
}
|
||||
|
||||
/* If we can't find the required character, break the matching loop,
|
||||
which will cause a return of PCRE_ERROR_NOMATCH. */
|
||||
|
||||
if (p >= end_subject) break;
|
||||
|
||||
/* If we have found the required character, save the point where we
|
||||
found it, so that we don't search again next time round the loop if
|
||||
the start hasn't passed this character yet. */
|
||||
|
||||
req_byte_ptr = p;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
while (p < end_subject)
|
||||
{
|
||||
if (*p++ == req_byte) { p--; break; }
|
||||
}
|
||||
}
|
||||
|
||||
/* If we can't find the required character, break the matching loop,
|
||||
which will cause a return of PCRE_ERROR_NOMATCH. */
|
||||
|
||||
if (p >= end_subject) break;
|
||||
|
||||
/* If we have found the required character, save the point where we
|
||||
found it, so that we don't search again next time round the loop if
|
||||
the start hasn't passed this character yet. */
|
||||
|
||||
req_byte_ptr = p;
|
||||
}
|
||||
}
|
||||
} /* End of optimizations that are done when not restarting */
|
||||
|
||||
/* OK, now we can do the business */
|
||||
|
||||
md->start_used_ptr = current_subject;
|
||||
|
||||
rc = internal_dfa_exec(
|
||||
md, /* fixed match data */
|
||||
md->start_code, /* this subexpression's code */
|
||||
@ -2903,9 +3099,9 @@ for (;;)
|
||||
not contain any explicit matches for \r or \n, and the newline option is CRLF
|
||||
or ANY or ANYCRLF, advance the match position by one more character. */
|
||||
|
||||
if (current_subject[-1] == '\r' &&
|
||||
if (current_subject[-1] == CHAR_CR &&
|
||||
current_subject < end_subject &&
|
||||
*current_subject == '\n' &&
|
||||
*current_subject == CHAR_NL &&
|
||||
(re->flags & PCRE_HASCRORLF) == 0 &&
|
||||
(md->nltype == NLTYPE_ANY ||
|
||||
md->nltype == NLTYPE_ANYCRLF ||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
Copyright (c) 1997-2009 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -89,7 +89,7 @@ if (re->magic_number != MAGIC_NUMBER)
|
||||
switch (what)
|
||||
{
|
||||
case PCRE_INFO_OPTIONS:
|
||||
*((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
|
||||
*((unsigned long int *)where) = re->options & PUBLIC_COMPILE_OPTIONS;
|
||||
break;
|
||||
|
||||
case PCRE_INFO_SIZE:
|
||||
@ -119,10 +119,16 @@ switch (what)
|
||||
|
||||
case PCRE_INFO_FIRSTTABLE:
|
||||
*((const uschar **)where) =
|
||||
(study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
|
||||
(study != NULL && (study->flags & PCRE_STUDY_MAPPED) != 0)?
|
||||
((const pcre_study_data *)extra_data->study_data)->start_bits : NULL;
|
||||
break;
|
||||
|
||||
case PCRE_INFO_MINLENGTH:
|
||||
*((int *)where) =
|
||||
(study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0)?
|
||||
study->minlength : -1;
|
||||
break;
|
||||
|
||||
case PCRE_INFO_LASTLITERAL:
|
||||
*((int *)where) =
|
||||
((re->flags & PCRE_REQCHSET) != 0)? re->req_byte : -1;
|
||||
@ -144,6 +150,9 @@ switch (what)
|
||||
*((const uschar **)where) = (const uschar *)(_pcre_default_tables);
|
||||
break;
|
||||
|
||||
/* From release 8.00 this will always return TRUE because NOPARTIAL is
|
||||
no longer ever set (the restrictions have been removed). */
|
||||
|
||||
case PCRE_INFO_OKPARTIAL:
|
||||
*((int *)where) = (re->flags & PCRE_NOPARTIAL) == 0;
|
||||
break;
|
||||
|
@ -43,8 +43,14 @@ PCRE is thread-clean and doesn't use any global variables in the normal sense.
|
||||
However, it calls memory allocation and freeing functions via the four
|
||||
indirections below, and it can optionally do callouts, using the fifth
|
||||
indirection. These values can be changed by the caller, but are shared between
|
||||
all threads. However, when compiling for Virtual Pascal, things are done
|
||||
differently, and global variables are not used (see pcre.in). */
|
||||
all threads.
|
||||
|
||||
For MS Visual Studio and Symbian OS, there are problems in initializing these
|
||||
variables to non-local functions. In these cases, therefore, an indirection via
|
||||
a local function is used.
|
||||
|
||||
Also, when compiling for Virtual Pascal, things are done differently, and
|
||||
global variables are not used. */
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
@ -52,6 +58,19 @@ differently, and global variables are not used (see pcre.in). */
|
||||
|
||||
#include "pcre_internal.h"
|
||||
|
||||
#if defined _MSC_VER || defined __SYMBIAN32__
|
||||
/* Local wrapper that simply forwards to malloc() and returns its result. It
exists only in the _MSC_VER / __SYMBIAN32__ build: as the comment at the top of
this file explains, those environments have problems initializing the global
function-pointer variables to non-local functions, so a local function is used
as an indirection. */
static void* LocalPcreMalloc(size_t aSize)
|
||||
{
|
||||
return malloc(aSize);
|
||||
}
|
||||
/* Local wrapper that simply forwards to free(). Like the malloc wrapper, it is
compiled only for _MSC_VER / __SYMBIAN32__, where the global function-pointer
variables cannot be initialized directly to non-local functions. */
static void LocalPcreFree(void* aPtr)
|
||||
{
|
||||
free(aPtr);
|
||||
}
|
||||
PCRE_EXP_DATA_DEFN int (*pcre_callout)(pcre_callout_block *) = NULL;
|
||||
|
||||
#elif !defined VPCOMPAT
|
||||
PCRE_EXP_DATA_DEFN int (*pcre_callout)(pcre_callout_block *) = NULL;
|
||||
#endif
|
||||
|
||||
/* End of pcre_globals.c */
|
||||
|
@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
Copyright (c) 1997-2009 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -83,7 +83,7 @@ if (re->magic_number != MAGIC_NUMBER)
|
||||
re = _pcre_try_flipped(re, &internal_re, NULL, NULL);
|
||||
if (re == NULL) return PCRE_ERROR_BADMAGIC;
|
||||
}
|
||||
if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
|
||||
if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_COMPILE_OPTIONS);
|
||||
if (first_byte != NULL)
|
||||
*first_byte = ((re->flags & PCRE_FIRSTSET) != 0)? re->first_byte :
|
||||
((re->flags & PCRE_STARTLINE) != 0)? -1 : -2;
|
||||
|
@ -7,7 +7,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
Copyright (c) 1997-2010 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -45,10 +45,24 @@ functions whose names all begin with "_pcre_". */
|
||||
#ifndef PCRE_INTERNAL_H
|
||||
#define PCRE_INTERNAL_H
|
||||
|
||||
/* Define DEBUG to get debugging output on stdout. */
|
||||
/* Define PCRE_DEBUG to get debugging output on stdout. */
|
||||
|
||||
#if 0
|
||||
#define DEBUG
|
||||
#define PCRE_DEBUG
|
||||
#endif
|
||||
|
||||
/* We do not support both EBCDIC and UTF-8 at the same time. The "configure"
|
||||
script prevents both being selected, but not everybody uses "configure". */
|
||||
|
||||
#if defined EBCDIC && defined SUPPORT_UTF8
|
||||
#error The use of both EBCDIC and SUPPORT_UTF8 is not supported.
|
||||
#endif
|
||||
|
||||
/* If SUPPORT_UCP is defined, SUPPORT_UTF8 must also be defined. The
|
||||
"configure" script ensures this, but not everybody uses "configure". */
|
||||
|
||||
#if defined SUPPORT_UCP && !defined SUPPORT_UTF8
|
||||
#define SUPPORT_UTF8 1
|
||||
#endif
|
||||
|
||||
/* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
|
||||
@ -60,7 +74,7 @@ It turns out that the Mac Debugging.h header also defines the macro DPRINTF, so
|
||||
be absolutely sure we get our version. */
|
||||
|
||||
#undef DPRINTF
|
||||
#ifdef DEBUG
|
||||
#ifdef PCRE_DEBUG
|
||||
#define DPRINTF(p) printf p
|
||||
#else
|
||||
#define DPRINTF(p) /* Nothing */
|
||||
@ -72,8 +86,6 @@ setjmp and stdarg are used is when NO_RECURSE is set. */
|
||||
|
||||
#include <ctype.h>
|
||||
#include <limits.h>
|
||||
#include <setjmp.h>
|
||||
#include <stdarg.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
@ -172,6 +184,26 @@ preprocessor time in standard C environments. */
|
||||
#error Cannot determine a type for 32-bit unsigned integers
|
||||
#endif
|
||||
|
||||
/* When checking for integer overflow in pcre_compile(), we need to handle
|
||||
large integers. If a 64-bit integer type is available, we can use that.
|
||||
Otherwise we have to cast to double, which of course requires floating point
|
||||
arithmetic. Handle this by defining a macro for the appropriate type. If
|
||||
stdint.h is available, include it; it may define INT64_MAX. Systems that do not
|
||||
have stdint.h (e.g. Solaris) may have inttypes.h. The macro int64_t may be set
|
||||
by "configure". */
|
||||
|
||||
#if HAVE_STDINT_H
|
||||
#include <stdint.h>
|
||||
#elif HAVE_INTTYPES_H
|
||||
#include <inttypes.h>
|
||||
#endif
|
||||
|
||||
#if defined INT64_MAX || defined int64_t
|
||||
#define INT64_OR_DOUBLE int64_t
|
||||
#else
|
||||
#define INT64_OR_DOUBLE double
|
||||
#endif
|
||||
|
||||
/* All character handling must be done as unsigned characters. Otherwise there
|
||||
are problems with top-bit-set characters and functions such as isspace().
|
||||
However, we leave the interface to the outside world as char *, because that
|
||||
@ -259,6 +291,7 @@ option on the command line. */
|
||||
#define strncmp(s1,s2,m) _strncmp(s1,s2,m)
|
||||
#define memcmp(s,c,n) _memcmp(s,c,n)
|
||||
#define memcpy(d,s,n) _memcpy(d,s,n)
|
||||
#define memmove(d,s,n) _memmove(d,s,n)
|
||||
#define memset(s,c,n) _memset(s,c,n)
|
||||
#else /* VPCOMPAT */
|
||||
|
||||
@ -477,6 +510,26 @@ if there are extra bytes. This is called when we know we are in UTF-8 mode. */
|
||||
len += gcaa; \
|
||||
}
|
||||
|
||||
/* Get the next UTF-8 character, testing for UTF-8 mode, not advancing the
|
||||
pointer, incrementing length if there are extra bytes. This is called when we
|
||||
do not know whether we are in UTF-8 mode. */
|
||||
|
||||
/* Usage notes: the expansion is a sequence of statements (an assignment
followed by an if block), not an expression, so it must not be used as the
unbraced body of an if/else or loop. It reads a variable named utf8 from the
scope in which it is expanded. The arguments c and eptr are evaluated more than
once, so they must be free of side effects; eptr itself is not modified. When
the lead byte is >= 0xc0 in UTF-8 mode, len is increased by the number of
additional bytes taken from _pcre_utf8_table4. */
#define GETCHARLENTEST(c, eptr, len) \
|
||||
c = *eptr; \
|
||||
if (utf8 && c >= 0xc0) \
|
||||
{ \
|
||||
int gcii; \
|
||||
int gcaa = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
|
||||
int gcss = 6*gcaa; \
|
||||
c = (c & _pcre_utf8_table3[gcaa]) << gcss; \
|
||||
for (gcii = 1; gcii <= gcaa; gcii++) \
|
||||
{ \
|
||||
gcss -= 6; \
|
||||
c |= (eptr[gcii] & 0x3f) << gcss; \
|
||||
} \
|
||||
len += gcaa; \
|
||||
}
|
||||
|
||||
/* If the pointer is not at the start of a character, move it back until
|
||||
it is. This is called only in UTF-8 mode - we don't put a test within the macro
|
||||
because almost all calls are already within a block of UTF-8 only code. */
|
||||
@ -500,7 +553,9 @@ Standard C system should have one. */
|
||||
|
||||
/* Private flags containing information about the compiled regex. They used to
|
||||
live at the top end of the options word, but that got almost full, so now they
|
||||
are in a 16-bit flags word. */
|
||||
are in a 16-bit flags word. From release 8.00, PCRE_NOPARTIAL is unused, as
|
||||
the restrictions on partial matching have been lifted. It remains for backwards
|
||||
compatibility. */
|
||||
|
||||
#define PCRE_NOPARTIAL 0x0001 /* can't use partial with this regex */
|
||||
#define PCRE_FIRSTSET 0x0002 /* first_byte is set */
|
||||
@ -512,6 +567,7 @@ are in a 16-bit flags word. */
|
||||
/* Options for the "extra" block produced by pcre_study(). */
|
||||
|
||||
#define PCRE_STUDY_MAPPED 0x01 /* a map of starting chars exists */
|
||||
#define PCRE_STUDY_MINLEN 0x02 /* a minimum length field exists */
|
||||
|
||||
/* Masks for identifying the public options that are permitted at compile
|
||||
time, run time, or study time, respectively. */
|
||||
@ -519,7 +575,7 @@ time, run time, or study time, respectively. */
|
||||
#define PCRE_NEWLINE_BITS (PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_ANY| \
|
||||
PCRE_NEWLINE_ANYCRLF)
|
||||
|
||||
#define PUBLIC_OPTIONS \
|
||||
#define PUBLIC_COMPILE_OPTIONS \
|
||||
(PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
|
||||
PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY|PCRE_UTF8| \
|
||||
PCRE_NO_AUTO_CAPTURE|PCRE_NO_UTF8_CHECK|PCRE_AUTO_CALLOUT|PCRE_FIRSTLINE| \
|
||||
@ -527,13 +583,15 @@ time, run time, or study time, respectively. */
|
||||
PCRE_JAVASCRIPT_COMPAT)
|
||||
|
||||
#define PUBLIC_EXEC_OPTIONS \
|
||||
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
|
||||
PCRE_PARTIAL|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)
|
||||
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \
|
||||
PCRE_NO_UTF8_CHECK|PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_NEWLINE_BITS| \
|
||||
PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE|PCRE_NO_START_OPTIMIZE)
|
||||
|
||||
#define PUBLIC_DFA_EXEC_OPTIONS \
|
||||
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NO_UTF8_CHECK| \
|
||||
PCRE_PARTIAL|PCRE_DFA_SHORTEST|PCRE_DFA_RESTART|PCRE_NEWLINE_BITS| \
|
||||
PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)
|
||||
(PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY|PCRE_NOTEMPTY_ATSTART| \
|
||||
PCRE_NO_UTF8_CHECK|PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_DFA_SHORTEST| \
|
||||
PCRE_DFA_RESTART|PCRE_NEWLINE_BITS|PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE| \
|
||||
PCRE_NO_START_OPTIMIZE)
|
||||
|
||||
#define PUBLIC_STUDY_OPTIONS 0 /* None defined */
|
||||
|
||||
@ -559,33 +617,566 @@ variable-length repeat, or a anything other than literal characters. */
|
||||
#define REQ_VARY 0x0200 /* reqbyte followed non-literal item */
|
||||
|
||||
/* Miscellaneous definitions. The #ifndef is to pacify compiler warnings in
|
||||
environments where these macros are defined elsewhere. */
|
||||
environments where these macros are defined elsewhere. Unfortunately, there
|
||||
is no way to do the same for the typedef. */
|
||||
|
||||
typedef gboolean BOOL;
|
||||
typedef gboolean BOOL;
|
||||
|
||||
/* If PCRE is to support UTF-8 on EBCDIC platforms, we cannot use normal
|
||||
character constants like '*' because the compiler would emit their EBCDIC code,
|
||||
which is different from their ASCII/UTF-8 code. Instead we define macros for
|
||||
the characters so that they always use the ASCII/UTF-8 code when UTF-8 support
|
||||
is enabled. When UTF-8 support is not enabled, the definitions use character
|
||||
literals. Both character and string versions of each character are needed, and
|
||||
there are some longer strings as well.
|
||||
|
||||
This means that, on EBCDIC platforms, the PCRE library can handle either
|
||||
EBCDIC, or UTF-8, but not both. To support both in the same compiled library
|
||||
would need different lookups depending on whether PCRE_UTF8 was set or not.
|
||||
This would make it impossible to use characters in switch/case statements,
|
||||
which would reduce performance. For a theoretical use (which nobody has asked
|
||||
for) in a minority area (EBCDIC platforms), this is not sensible. Any
|
||||
application that did need both could compile two versions of the library, using
|
||||
macros to give the functions distinct names. */
|
||||
|
||||
#ifndef SUPPORT_UTF8
|
||||
|
||||
/* UTF-8 support is not enabled; use the platform-dependent character literals
|
||||
so that PCRE works on both ASCII and EBCDIC platforms, in non-UTF-mode only. */
|
||||
|
||||
#define CHAR_HT '\t'
|
||||
#define CHAR_VT '\v'
|
||||
#define CHAR_FF '\f'
|
||||
#define CHAR_CR '\r'
|
||||
#define CHAR_NL '\n'
|
||||
#define CHAR_BS '\b'
|
||||
#define CHAR_BEL '\a'
|
||||
#ifdef EBCDIC
|
||||
#define CHAR_ESC '\047'
|
||||
#define CHAR_DEL '\007'
|
||||
#else
|
||||
#define CHAR_ESC '\033'
|
||||
#define CHAR_DEL '\177'
|
||||
#endif
|
||||
|
||||
#define CHAR_SPACE ' '
|
||||
#define CHAR_EXCLAMATION_MARK '!'
|
||||
#define CHAR_QUOTATION_MARK '"'
|
||||
#define CHAR_NUMBER_SIGN '#'
|
||||
#define CHAR_DOLLAR_SIGN '$'
|
||||
#define CHAR_PERCENT_SIGN '%'
|
||||
#define CHAR_AMPERSAND '&'
|
||||
#define CHAR_APOSTROPHE '\''
|
||||
#define CHAR_LEFT_PARENTHESIS '('
|
||||
#define CHAR_RIGHT_PARENTHESIS ')'
|
||||
#define CHAR_ASTERISK '*'
|
||||
#define CHAR_PLUS '+'
|
||||
#define CHAR_COMMA ','
|
||||
#define CHAR_MINUS '-'
|
||||
#define CHAR_DOT '.'
|
||||
#define CHAR_SLASH '/'
|
||||
#define CHAR_0 '0'
|
||||
#define CHAR_1 '1'
|
||||
#define CHAR_2 '2'
|
||||
#define CHAR_3 '3'
|
||||
#define CHAR_4 '4'
|
||||
#define CHAR_5 '5'
|
||||
#define CHAR_6 '6'
|
||||
#define CHAR_7 '7'
|
||||
#define CHAR_8 '8'
|
||||
#define CHAR_9 '9'
|
||||
#define CHAR_COLON ':'
|
||||
#define CHAR_SEMICOLON ';'
|
||||
#define CHAR_LESS_THAN_SIGN '<'
|
||||
#define CHAR_EQUALS_SIGN '='
|
||||
#define CHAR_GREATER_THAN_SIGN '>'
|
||||
#define CHAR_QUESTION_MARK '?'
|
||||
#define CHAR_COMMERCIAL_AT '@'
|
||||
#define CHAR_A 'A'
|
||||
#define CHAR_B 'B'
|
||||
#define CHAR_C 'C'
|
||||
#define CHAR_D 'D'
|
||||
#define CHAR_E 'E'
|
||||
#define CHAR_F 'F'
|
||||
#define CHAR_G 'G'
|
||||
#define CHAR_H 'H'
|
||||
#define CHAR_I 'I'
|
||||
#define CHAR_J 'J'
|
||||
#define CHAR_K 'K'
|
||||
#define CHAR_L 'L'
|
||||
#define CHAR_M 'M'
|
||||
#define CHAR_N 'N'
|
||||
#define CHAR_O 'O'
|
||||
#define CHAR_P 'P'
|
||||
#define CHAR_Q 'Q'
|
||||
#define CHAR_R 'R'
|
||||
#define CHAR_S 'S'
|
||||
#define CHAR_T 'T'
|
||||
#define CHAR_U 'U'
|
||||
#define CHAR_V 'V'
|
||||
#define CHAR_W 'W'
|
||||
#define CHAR_X 'X'
|
||||
#define CHAR_Y 'Y'
|
||||
#define CHAR_Z 'Z'
|
||||
#define CHAR_LEFT_SQUARE_BRACKET '['
|
||||
#define CHAR_BACKSLASH '\\'
|
||||
#define CHAR_RIGHT_SQUARE_BRACKET ']'
|
||||
#define CHAR_CIRCUMFLEX_ACCENT '^'
|
||||
#define CHAR_UNDERSCORE '_'
|
||||
#define CHAR_GRAVE_ACCENT '`'
|
||||
#define CHAR_a 'a'
|
||||
#define CHAR_b 'b'
|
||||
#define CHAR_c 'c'
|
||||
#define CHAR_d 'd'
|
||||
#define CHAR_e 'e'
|
||||
#define CHAR_f 'f'
|
||||
#define CHAR_g 'g'
|
||||
#define CHAR_h 'h'
|
||||
#define CHAR_i 'i'
|
||||
#define CHAR_j 'j'
|
||||
#define CHAR_k 'k'
|
||||
#define CHAR_l 'l'
|
||||
#define CHAR_m 'm'
|
||||
#define CHAR_n 'n'
|
||||
#define CHAR_o 'o'
|
||||
#define CHAR_p 'p'
|
||||
#define CHAR_q 'q'
|
||||
#define CHAR_r 'r'
|
||||
#define CHAR_s 's'
|
||||
#define CHAR_t 't'
|
||||
#define CHAR_u 'u'
|
||||
#define CHAR_v 'v'
|
||||
#define CHAR_w 'w'
|
||||
#define CHAR_x 'x'
|
||||
#define CHAR_y 'y'
|
||||
#define CHAR_z 'z'
|
||||
#define CHAR_LEFT_CURLY_BRACKET '{'
|
||||
#define CHAR_VERTICAL_LINE '|'
|
||||
#define CHAR_RIGHT_CURLY_BRACKET '}'
|
||||
#define CHAR_TILDE '~'
|
||||
|
||||
#define STR_HT "\t"
|
||||
#define STR_VT "\v"
|
||||
#define STR_FF "\f"
|
||||
#define STR_CR "\r"
|
||||
#define STR_NL "\n"
|
||||
#define STR_BS "\b"
|
||||
#define STR_BEL "\a"
|
||||
#ifdef EBCDIC
|
||||
#define STR_ESC "\047"
|
||||
#define STR_DEL "\007"
|
||||
#else
|
||||
#define STR_ESC "\033"
|
||||
#define STR_DEL "\177"
|
||||
#endif
|
||||
|
||||
#define STR_SPACE " "
|
||||
#define STR_EXCLAMATION_MARK "!"
|
||||
#define STR_QUOTATION_MARK "\""
|
||||
#define STR_NUMBER_SIGN "#"
|
||||
#define STR_DOLLAR_SIGN "$"
|
||||
#define STR_PERCENT_SIGN "%"
|
||||
#define STR_AMPERSAND "&"
|
||||
#define STR_APOSTROPHE "'"
|
||||
#define STR_LEFT_PARENTHESIS "("
|
||||
#define STR_RIGHT_PARENTHESIS ")"
|
||||
#define STR_ASTERISK "*"
|
||||
#define STR_PLUS "+"
|
||||
#define STR_COMMA ","
|
||||
#define STR_MINUS "-"
|
||||
#define STR_DOT "."
|
||||
#define STR_SLASH "/"
|
||||
#define STR_0 "0"
|
||||
#define STR_1 "1"
|
||||
#define STR_2 "2"
|
||||
#define STR_3 "3"
|
||||
#define STR_4 "4"
|
||||
#define STR_5 "5"
|
||||
#define STR_6 "6"
|
||||
#define STR_7 "7"
|
||||
#define STR_8 "8"
|
||||
#define STR_9 "9"
|
||||
#define STR_COLON ":"
|
||||
#define STR_SEMICOLON ";"
|
||||
#define STR_LESS_THAN_SIGN "<"
|
||||
#define STR_EQUALS_SIGN "="
|
||||
#define STR_GREATER_THAN_SIGN ">"
|
||||
#define STR_QUESTION_MARK "?"
|
||||
#define STR_COMMERCIAL_AT "@"
|
||||
#define STR_A "A"
|
||||
#define STR_B "B"
|
||||
#define STR_C "C"
|
||||
#define STR_D "D"
|
||||
#define STR_E "E"
|
||||
#define STR_F "F"
|
||||
#define STR_G "G"
|
||||
#define STR_H "H"
|
||||
#define STR_I "I"
|
||||
#define STR_J "J"
|
||||
#define STR_K "K"
|
||||
#define STR_L "L"
|
||||
#define STR_M "M"
|
||||
#define STR_N "N"
|
||||
#define STR_O "O"
|
||||
#define STR_P "P"
|
||||
#define STR_Q "Q"
|
||||
#define STR_R "R"
|
||||
#define STR_S "S"
|
||||
#define STR_T "T"
|
||||
#define STR_U "U"
|
||||
#define STR_V "V"
|
||||
#define STR_W "W"
|
||||
#define STR_X "X"
|
||||
#define STR_Y "Y"
|
||||
#define STR_Z "Z"
|
||||
#define STR_LEFT_SQUARE_BRACKET "["
|
||||
#define STR_BACKSLASH "\\"
|
||||
#define STR_RIGHT_SQUARE_BRACKET "]"
|
||||
#define STR_CIRCUMFLEX_ACCENT "^"
|
||||
#define STR_UNDERSCORE "_"
|
||||
#define STR_GRAVE_ACCENT "`"
|
||||
#define STR_a "a"
|
||||
#define STR_b "b"
|
||||
#define STR_c "c"
|
||||
#define STR_d "d"
|
||||
#define STR_e "e"
|
||||
#define STR_f "f"
|
||||
#define STR_g "g"
|
||||
#define STR_h "h"
|
||||
#define STR_i "i"
|
||||
#define STR_j "j"
|
||||
#define STR_k "k"
|
||||
#define STR_l "l"
|
||||
#define STR_m "m"
|
||||
#define STR_n "n"
|
||||
#define STR_o "o"
|
||||
#define STR_p "p"
|
||||
#define STR_q "q"
|
||||
#define STR_r "r"
|
||||
#define STR_s "s"
|
||||
#define STR_t "t"
|
||||
#define STR_u "u"
|
||||
#define STR_v "v"
|
||||
#define STR_w "w"
|
||||
#define STR_x "x"
|
||||
#define STR_y "y"
|
||||
#define STR_z "z"
|
||||
#define STR_LEFT_CURLY_BRACKET "{"
|
||||
#define STR_VERTICAL_LINE "|"
|
||||
#define STR_RIGHT_CURLY_BRACKET "}"
|
||||
#define STR_TILDE "~"
|
||||
|
||||
#define STRING_ACCEPT0 "ACCEPT\0"
|
||||
#define STRING_COMMIT0 "COMMIT\0"
|
||||
#define STRING_F0 "F\0"
|
||||
#define STRING_FAIL0 "FAIL\0"
|
||||
#define STRING_PRUNE0 "PRUNE\0"
|
||||
#define STRING_SKIP0 "SKIP\0"
|
||||
#define STRING_THEN "THEN"
|
||||
|
||||
#define STRING_alpha0 "alpha\0"
|
||||
#define STRING_lower0 "lower\0"
|
||||
#define STRING_upper0 "upper\0"
|
||||
#define STRING_alnum0 "alnum\0"
|
||||
#define STRING_ascii0 "ascii\0"
|
||||
#define STRING_blank0 "blank\0"
|
||||
#define STRING_cntrl0 "cntrl\0"
|
||||
#define STRING_digit0 "digit\0"
|
||||
#define STRING_graph0 "graph\0"
|
||||
#define STRING_print0 "print\0"
|
||||
#define STRING_punct0 "punct\0"
|
||||
#define STRING_space0 "space\0"
|
||||
#define STRING_word0 "word\0"
|
||||
#define STRING_xdigit "xdigit"
|
||||
|
||||
#define STRING_DEFINE "DEFINE"
|
||||
|
||||
#define STRING_CR_RIGHTPAR "CR)"
|
||||
#define STRING_LF_RIGHTPAR "LF)"
|
||||
#define STRING_CRLF_RIGHTPAR "CRLF)"
|
||||
#define STRING_ANY_RIGHTPAR "ANY)"
|
||||
#define STRING_ANYCRLF_RIGHTPAR "ANYCRLF)"
|
||||
#define STRING_BSR_ANYCRLF_RIGHTPAR "BSR_ANYCRLF)"
|
||||
#define STRING_BSR_UNICODE_RIGHTPAR "BSR_UNICODE)"
|
||||
#define STRING_UTF8_RIGHTPAR "UTF8)"
|
||||
|
||||
#else /* SUPPORT_UTF8 */
|
||||
|
||||
/* UTF-8 support is enabled; always use UTF-8 (=ASCII) character codes. This
|
||||
works in both modes on non-EBCDIC platforms, and on EBCDIC platforms in UTF-8 mode
|
||||
only. */
|
||||
|
||||
#define CHAR_HT '\011'
|
||||
#define CHAR_VT '\013'
|
||||
#define CHAR_FF '\014'
|
||||
#define CHAR_CR '\015'
|
||||
#define CHAR_NL '\012'
|
||||
#define CHAR_BS '\010'
|
||||
#define CHAR_BEL '\007'
|
||||
#define CHAR_ESC '\033'
|
||||
#define CHAR_DEL '\177'
|
||||
|
||||
#define CHAR_SPACE '\040'
|
||||
#define CHAR_EXCLAMATION_MARK '\041'
|
||||
#define CHAR_QUOTATION_MARK '\042'
|
||||
#define CHAR_NUMBER_SIGN '\043'
|
||||
#define CHAR_DOLLAR_SIGN '\044'
|
||||
#define CHAR_PERCENT_SIGN '\045'
|
||||
#define CHAR_AMPERSAND '\046'
|
||||
#define CHAR_APOSTROPHE '\047'
|
||||
#define CHAR_LEFT_PARENTHESIS '\050'
|
||||
#define CHAR_RIGHT_PARENTHESIS '\051'
|
||||
#define CHAR_ASTERISK '\052'
|
||||
#define CHAR_PLUS '\053'
|
||||
#define CHAR_COMMA '\054'
|
||||
#define CHAR_MINUS '\055'
|
||||
#define CHAR_DOT '\056'
|
||||
#define CHAR_SLASH '\057'
|
||||
#define CHAR_0 '\060'
|
||||
#define CHAR_1 '\061'
|
||||
#define CHAR_2 '\062'
|
||||
#define CHAR_3 '\063'
|
||||
#define CHAR_4 '\064'
|
||||
#define CHAR_5 '\065'
|
||||
#define CHAR_6 '\066'
|
||||
#define CHAR_7 '\067'
|
||||
#define CHAR_8 '\070'
|
||||
#define CHAR_9 '\071'
|
||||
#define CHAR_COLON '\072'
|
||||
#define CHAR_SEMICOLON '\073'
|
||||
#define CHAR_LESS_THAN_SIGN '\074'
|
||||
#define CHAR_EQUALS_SIGN '\075'
|
||||
#define CHAR_GREATER_THAN_SIGN '\076'
|
||||
#define CHAR_QUESTION_MARK '\077'
|
||||
#define CHAR_COMMERCIAL_AT '\100'
|
||||
#define CHAR_A '\101'
|
||||
#define CHAR_B '\102'
|
||||
#define CHAR_C '\103'
|
||||
#define CHAR_D '\104'
|
||||
#define CHAR_E '\105'
|
||||
#define CHAR_F '\106'
|
||||
#define CHAR_G '\107'
|
||||
#define CHAR_H '\110'
|
||||
#define CHAR_I '\111'
|
||||
#define CHAR_J '\112'
|
||||
#define CHAR_K '\113'
|
||||
#define CHAR_L '\114'
|
||||
#define CHAR_M '\115'
|
||||
#define CHAR_N '\116'
|
||||
#define CHAR_O '\117'
|
||||
#define CHAR_P '\120'
|
||||
#define CHAR_Q '\121'
|
||||
#define CHAR_R '\122'
|
||||
#define CHAR_S '\123'
|
||||
#define CHAR_T '\124'
|
||||
#define CHAR_U '\125'
|
||||
#define CHAR_V '\126'
|
||||
#define CHAR_W '\127'
|
||||
#define CHAR_X '\130'
|
||||
#define CHAR_Y '\131'
|
||||
#define CHAR_Z '\132'
|
||||
#define CHAR_LEFT_SQUARE_BRACKET '\133'
|
||||
#define CHAR_BACKSLASH '\134'
|
||||
#define CHAR_RIGHT_SQUARE_BRACKET '\135'
|
||||
#define CHAR_CIRCUMFLEX_ACCENT '\136'
|
||||
#define CHAR_UNDERSCORE '\137'
|
||||
#define CHAR_GRAVE_ACCENT '\140'
|
||||
#define CHAR_a '\141'
|
||||
#define CHAR_b '\142'
|
||||
#define CHAR_c '\143'
|
||||
#define CHAR_d '\144'
|
||||
#define CHAR_e '\145'
|
||||
#define CHAR_f '\146'
|
||||
#define CHAR_g '\147'
|
||||
#define CHAR_h '\150'
|
||||
#define CHAR_i '\151'
|
||||
#define CHAR_j '\152'
|
||||
#define CHAR_k '\153'
|
||||
#define CHAR_l '\154'
|
||||
#define CHAR_m '\155'
|
||||
#define CHAR_n '\156'
|
||||
#define CHAR_o '\157'
|
||||
#define CHAR_p '\160'
|
||||
#define CHAR_q '\161'
|
||||
#define CHAR_r '\162'
|
||||
#define CHAR_s '\163'
|
||||
#define CHAR_t '\164'
|
||||
#define CHAR_u '\165'
|
||||
#define CHAR_v '\166'
|
||||
#define CHAR_w '\167'
|
||||
#define CHAR_x '\170'
|
||||
#define CHAR_y '\171'
|
||||
#define CHAR_z '\172'
|
||||
#define CHAR_LEFT_CURLY_BRACKET '\173'
|
||||
#define CHAR_VERTICAL_LINE '\174'
|
||||
#define CHAR_RIGHT_CURLY_BRACKET '\175'
|
||||
#define CHAR_TILDE '\176'
|
||||
|
||||
#define STR_HT "\011"
|
||||
#define STR_VT "\013"
|
||||
#define STR_FF "\014"
|
||||
#define STR_CR "\015"
|
||||
#define STR_NL "\012"
|
||||
#define STR_BS "\010"
|
||||
#define STR_BEL "\007"
|
||||
#define STR_ESC "\033"
|
||||
#define STR_DEL "\177"
|
||||
|
||||
#define STR_SPACE "\040"
|
||||
#define STR_EXCLAMATION_MARK "\041"
|
||||
#define STR_QUOTATION_MARK "\042"
|
||||
#define STR_NUMBER_SIGN "\043"
|
||||
#define STR_DOLLAR_SIGN "\044"
|
||||
#define STR_PERCENT_SIGN "\045"
|
||||
#define STR_AMPERSAND "\046"
|
||||
#define STR_APOSTROPHE "\047"
|
||||
#define STR_LEFT_PARENTHESIS "\050"
|
||||
#define STR_RIGHT_PARENTHESIS "\051"
|
||||
#define STR_ASTERISK "\052"
|
||||
#define STR_PLUS "\053"
|
||||
#define STR_COMMA "\054"
|
||||
#define STR_MINUS "\055"
|
||||
#define STR_DOT "\056"
|
||||
#define STR_SLASH "\057"
|
||||
#define STR_0 "\060"
|
||||
#define STR_1 "\061"
|
||||
#define STR_2 "\062"
|
||||
#define STR_3 "\063"
|
||||
#define STR_4 "\064"
|
||||
#define STR_5 "\065"
|
||||
#define STR_6 "\066"
|
||||
#define STR_7 "\067"
|
||||
#define STR_8 "\070"
|
||||
#define STR_9 "\071"
|
||||
#define STR_COLON "\072"
|
||||
#define STR_SEMICOLON "\073"
|
||||
#define STR_LESS_THAN_SIGN "\074"
|
||||
#define STR_EQUALS_SIGN "\075"
|
||||
#define STR_GREATER_THAN_SIGN "\076"
|
||||
#define STR_QUESTION_MARK "\077"
|
||||
#define STR_COMMERCIAL_AT "\100"
|
||||
#define STR_A "\101"
|
||||
#define STR_B "\102"
|
||||
#define STR_C "\103"
|
||||
#define STR_D "\104"
|
||||
#define STR_E "\105"
|
||||
#define STR_F "\106"
|
||||
#define STR_G "\107"
|
||||
#define STR_H "\110"
|
||||
#define STR_I "\111"
|
||||
#define STR_J "\112"
|
||||
#define STR_K "\113"
|
||||
#define STR_L "\114"
|
||||
#define STR_M "\115"
|
||||
#define STR_N "\116"
|
||||
#define STR_O "\117"
|
||||
#define STR_P "\120"
|
||||
#define STR_Q "\121"
|
||||
#define STR_R "\122"
|
||||
#define STR_S "\123"
|
||||
#define STR_T "\124"
|
||||
#define STR_U "\125"
|
||||
#define STR_V "\126"
|
||||
#define STR_W "\127"
|
||||
#define STR_X "\130"
|
||||
#define STR_Y "\131"
|
||||
#define STR_Z "\132"
|
||||
#define STR_LEFT_SQUARE_BRACKET "\133"
|
||||
#define STR_BACKSLASH "\134"
|
||||
#define STR_RIGHT_SQUARE_BRACKET "\135"
|
||||
#define STR_CIRCUMFLEX_ACCENT "\136"
|
||||
#define STR_UNDERSCORE "\137"
|
||||
#define STR_GRAVE_ACCENT "\140"
|
||||
#define STR_a "\141"
|
||||
#define STR_b "\142"
|
||||
#define STR_c "\143"
|
||||
#define STR_d "\144"
|
||||
#define STR_e "\145"
|
||||
#define STR_f "\146"
|
||||
#define STR_g "\147"
|
||||
#define STR_h "\150"
|
||||
#define STR_i "\151"
|
||||
#define STR_j "\152"
|
||||
#define STR_k "\153"
|
||||
#define STR_l "\154"
|
||||
#define STR_m "\155"
|
||||
#define STR_n "\156"
|
||||
#define STR_o "\157"
|
||||
#define STR_p "\160"
|
||||
#define STR_q "\161"
|
||||
#define STR_r "\162"
|
||||
#define STR_s "\163"
|
||||
#define STR_t "\164"
|
||||
#define STR_u "\165"
|
||||
#define STR_v "\166"
|
||||
#define STR_w "\167"
|
||||
#define STR_x "\170"
|
||||
#define STR_y "\171"
|
||||
#define STR_z "\172"
|
||||
#define STR_LEFT_CURLY_BRACKET "\173"
|
||||
#define STR_VERTICAL_LINE "\174"
|
||||
#define STR_RIGHT_CURLY_BRACKET "\175"
|
||||
#define STR_TILDE "\176"
|
||||
|
||||
#define STRING_ACCEPT0 STR_A STR_C STR_C STR_E STR_P STR_T "\0"
|
||||
#define STRING_COMMIT0 STR_C STR_O STR_M STR_M STR_I STR_T "\0"
|
||||
#define STRING_F0 STR_F "\0"
|
||||
#define STRING_FAIL0 STR_F STR_A STR_I STR_L "\0"
|
||||
#define STRING_PRUNE0 STR_P STR_R STR_U STR_N STR_E "\0"
|
||||
#define STRING_SKIP0 STR_S STR_K STR_I STR_P "\0"
|
||||
#define STRING_THEN STR_T STR_H STR_E STR_N
|
||||
|
||||
#define STRING_alpha0 STR_a STR_l STR_p STR_h STR_a "\0"
|
||||
#define STRING_lower0 STR_l STR_o STR_w STR_e STR_r "\0"
|
||||
#define STRING_upper0 STR_u STR_p STR_p STR_e STR_r "\0"
|
||||
#define STRING_alnum0 STR_a STR_l STR_n STR_u STR_m "\0"
|
||||
#define STRING_ascii0 STR_a STR_s STR_c STR_i STR_i "\0"
|
||||
#define STRING_blank0 STR_b STR_l STR_a STR_n STR_k "\0"
|
||||
#define STRING_cntrl0 STR_c STR_n STR_t STR_r STR_l "\0"
|
||||
#define STRING_digit0 STR_d STR_i STR_g STR_i STR_t "\0"
|
||||
#define STRING_graph0 STR_g STR_r STR_a STR_p STR_h "\0"
|
||||
#define STRING_print0 STR_p STR_r STR_i STR_n STR_t "\0"
|
||||
#define STRING_punct0 STR_p STR_u STR_n STR_c STR_t "\0"
|
||||
#define STRING_space0 STR_s STR_p STR_a STR_c STR_e "\0"
|
||||
#define STRING_word0 STR_w STR_o STR_r STR_d "\0"
|
||||
#define STRING_xdigit STR_x STR_d STR_i STR_g STR_i STR_t
|
||||
|
||||
#define STRING_DEFINE STR_D STR_E STR_F STR_I STR_N STR_E
|
||||
|
||||
#define STRING_CR_RIGHTPAR STR_C STR_R STR_RIGHT_PARENTHESIS
|
||||
#define STRING_LF_RIGHTPAR STR_L STR_F STR_RIGHT_PARENTHESIS
|
||||
#define STRING_CRLF_RIGHTPAR STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
|
||||
#define STRING_ANY_RIGHTPAR STR_A STR_N STR_Y STR_RIGHT_PARENTHESIS
|
||||
#define STRING_ANYCRLF_RIGHTPAR STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
|
||||
#define STRING_BSR_ANYCRLF_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_A STR_N STR_Y STR_C STR_R STR_L STR_F STR_RIGHT_PARENTHESIS
|
||||
#define STRING_BSR_UNICODE_RIGHTPAR STR_B STR_S STR_R STR_UNDERSCORE STR_U STR_N STR_I STR_C STR_O STR_D STR_E STR_RIGHT_PARENTHESIS
|
||||
#define STRING_UTF8_RIGHTPAR STR_U STR_T STR_F STR_8 STR_RIGHT_PARENTHESIS
|
||||
|
||||
#endif /* SUPPORT_UTF8 */
|
||||
|
||||
/* Escape items that are just an encoding of a particular data value. */
|
||||
|
||||
#ifndef ESC_e
|
||||
#define ESC_e 27
|
||||
#define ESC_e CHAR_ESC
|
||||
#endif
|
||||
|
||||
#ifndef ESC_f
|
||||
#define ESC_f '\f'
|
||||
#define ESC_f CHAR_FF
|
||||
#endif
|
||||
|
||||
#ifndef ESC_n
|
||||
#define ESC_n '\n'
|
||||
#define ESC_n CHAR_NL
|
||||
#endif
|
||||
|
||||
#ifndef ESC_r
|
||||
#define ESC_r '\r'
|
||||
#define ESC_r CHAR_CR
|
||||
#endif
|
||||
|
||||
/* We can't officially use ESC_t because it is a POSIX reserved identifier
|
||||
(presumably because of all the others like size_t). */
|
||||
|
||||
#ifndef ESC_tee
|
||||
#define ESC_tee '\t'
|
||||
#define ESC_tee CHAR_HT
|
||||
#endif
|
||||
|
||||
/* Codes for different types of Unicode property */
|
||||
@ -632,8 +1223,8 @@ enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s,
|
||||
OP_EOD must correspond in order to the list of escapes immediately above.
|
||||
|
||||
*** NOTE NOTE NOTE *** Whenever this list is updated, the two macro definitions
|
||||
that follow must also be updated to match. There is also a table called
|
||||
"coptable" in pcre_dfa_exec.c that must be updated. */
|
||||
that follow must also be updated to match. There are also tables called
|
||||
"coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */
|
||||
|
||||
enum {
|
||||
OP_END, /* 0 End of pattern */
|
||||
@ -769,30 +1360,45 @@ enum {
|
||||
OP_SCBRA, /* 98 Start of capturing bracket, check empty */
|
||||
OP_SCOND, /* 99 Conditional group, check empty */
|
||||
|
||||
OP_CREF, /* 100 Used to hold a capture number as condition */
|
||||
OP_RREF, /* 101 Used to hold a recursion number as condition */
|
||||
OP_DEF, /* 102 The DEFINE condition */
|
||||
/* The next two pairs must (respectively) be kept together. */
|
||||
|
||||
OP_BRAZERO, /* 103 These two must remain together and in this */
|
||||
OP_BRAMINZERO, /* 104 order. */
|
||||
OP_CREF, /* 100 Used to hold a capture number as condition */
|
||||
OP_NCREF, /* 101 Same, but generated by a name reference */
|
||||
OP_RREF, /* 102 Used to hold a recursion number as condition */
|
||||
OP_NRREF, /* 103 Same, but generated by a name reference */
|
||||
OP_DEF, /* 104 The DEFINE condition */
|
||||
|
||||
OP_BRAZERO, /* 105 These two must remain together and in this */
|
||||
OP_BRAMINZERO, /* 106 order. */
|
||||
|
||||
/* These are backtracking control verbs */
|
||||
|
||||
OP_PRUNE, /* 105 */
|
||||
OP_SKIP, /* 106 */
|
||||
OP_THEN, /* 107 */
|
||||
OP_COMMIT, /* 108 */
|
||||
OP_PRUNE, /* 107 */
|
||||
OP_SKIP, /* 108 */
|
||||
OP_THEN, /* 109 */
|
||||
OP_COMMIT, /* 110 */
|
||||
|
||||
/* These are forced failure and success verbs */
|
||||
|
||||
OP_FAIL, /* 109 */
|
||||
OP_ACCEPT, /* 110 */
|
||||
OP_FAIL, /* 111 */
|
||||
OP_ACCEPT, /* 112 */
|
||||
OP_CLOSE, /* 113 Used before OP_ACCEPT to close open captures */
|
||||
|
||||
/* This is used to skip a subpattern with a {0} quantifier */
|
||||
|
||||
OP_SKIPZERO /* 111 */
|
||||
OP_SKIPZERO, /* 114 */
|
||||
|
||||
/* This is not an opcode, but is used to check that tables indexed by opcode
|
||||
are the correct length, in order to catch updating errors - there have been
|
||||
some in the past. */
|
||||
|
||||
OP_TABLE_LENGTH
|
||||
};
|
||||
|
||||
/* *** NOTE NOTE NOTE *** Whenever the list above is updated, the two macro
|
||||
definitions that follow must also be updated to match. There are also tables
|
||||
called "coptable" and "poptable" in pcre_dfa_exec.c that must be updated. */
|
||||
|
||||
|
||||
/* This macro defines textual names for all the opcodes. These are used only
|
||||
for debugging. The macro is referenced only in pcre_printint.c. */
|
||||
@ -814,9 +1420,10 @@ for debugging. The macro is referenced only in pcre_printint.c. */
|
||||
"Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", \
|
||||
"AssertB", "AssertB not", "Reverse", \
|
||||
"Once", "Bra", "CBra", "Cond", "SBra", "SCBra", "SCond", \
|
||||
"Cond ref", "Cond rec", "Cond def", "Brazero", "Braminzero", \
|
||||
"Cond ref", "Cond nref", "Cond rec", "Cond nrec", "Cond def", \
|
||||
"Brazero", "Braminzero", \
|
||||
"*PRUNE", "*SKIP", "*THEN", "*COMMIT", "*FAIL", "*ACCEPT", \
|
||||
"Skip zero"
|
||||
"Close", "Skip zero"
|
||||
|
||||
|
||||
/* This macro defines the length of fixed length operations in the compiled
|
||||
@ -833,8 +1440,9 @@ in UTF-8 mode. The code that uses this table must know about such things. */
|
||||
1, 1, 1, 1, 1, /* \A, \G, \K, \B, \b */ \
|
||||
1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */ \
|
||||
1, 1, 1, /* Any, AllAny, Anybyte */ \
|
||||
3, 3, 1, /* NOTPROP, PROP, EXTUNI */ \
|
||||
3, 3, /* \P, \p */ \
|
||||
1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */ \
|
||||
1, /* \X */ \
|
||||
1, 1, 2, 1, 1, /* \Z, \z, Opt, ^, $ */ \
|
||||
2, /* Char - the minimum length */ \
|
||||
2, /* Charnc - the minimum length */ \
|
||||
@ -876,20 +1484,22 @@ in UTF-8 mode. The code that uses this table must know about such things. */
|
||||
1+LINK_SIZE, /* SBRA */ \
|
||||
3+LINK_SIZE, /* SCBRA */ \
|
||||
1+LINK_SIZE, /* SCOND */ \
|
||||
3, /* CREF */ \
|
||||
3, /* RREF */ \
|
||||
3, 3, /* CREF, NCREF */ \
|
||||
3, 3, /* RREF, NRREF */ \
|
||||
1, /* DEF */ \
|
||||
1, 1, /* BRAZERO, BRAMINZERO */ \
|
||||
1, 1, 1, 1, /* PRUNE, SKIP, THEN, COMMIT, */ \
|
||||
1, 1, 1 /* FAIL, ACCEPT, SKIPZERO */
|
||||
1, 1, 3, 1 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
|
||||
|
||||
|
||||
/* A magic value for OP_RREF to indicate the "any recursion" condition. */
|
||||
/* A magic value for OP_RREF and OP_NRREF to indicate the "any recursion"
|
||||
condition. */
|
||||
|
||||
#define RREF_ANY 0xffff
|
||||
|
||||
/* Error code numbers. They are given names so that they can more easily be
|
||||
tracked. */
|
||||
/* Compile time error code numbers. They are given names so that they can more
|
||||
easily be tracked. When a new number is added, the table called eint in
|
||||
pcreposix.c must be updated. */
|
||||
|
||||
enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
|
||||
ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17, ERR18, ERR19,
|
||||
@ -897,7 +1507,7 @@ enum { ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
|
||||
ERR30, ERR31, ERR32, ERR33, ERR34, ERR35, ERR36, ERR37, ERR38, ERR39,
|
||||
ERR40, ERR41, ERR42, ERR43, ERR44, ERR45, ERR46, ERR47, ERR48, ERR49,
|
||||
ERR50, ERR51, ERR52, ERR53, ERR54, ERR55, ERR56, ERR57, ERR58, ERR59,
|
||||
ERR60, ERR61, ERR62, ERR63, ERR64 };
|
||||
ERR60, ERR61, ERR62, ERR63, ERR64, ERR65, ERRCOUNT };
|
||||
|
||||
/* The real format of the start of the pcre block; the index of names and the
|
||||
code vector run on as long as necessary after the end. We store an explicit
|
||||
@ -913,7 +1523,7 @@ Because people can now save and re-use compiled patterns, any additions to this
|
||||
structure should be made at the end, and something earlier (e.g. a new
|
||||
flag in the options or one of the dummy fields) should indicate that the new
|
||||
fields are present. Currently PCRE always sets the dummy fields to zero.
|
||||
NOTE NOTE NOTE:
|
||||
NOTE NOTE NOTE
|
||||
*/
|
||||
|
||||
typedef struct real_pcre {
|
||||
@ -940,10 +1550,22 @@ remark (see NOTE above) about extending this structure applies. */
|
||||
|
||||
typedef struct pcre_study_data {
|
||||
pcre_uint32 size; /* Total that was malloced */
|
||||
pcre_uint32 options;
|
||||
uschar start_bits[32];
|
||||
pcre_uint32 flags; /* Private flags */
|
||||
uschar start_bits[32]; /* Starting char bits */
|
||||
pcre_uint32 minlength; /* Minimum subject length */
|
||||
} pcre_study_data;
|
||||
|
||||
/* Structure for building a chain of open capturing subpatterns during
|
||||
compiling, so that instructions to close them can be compiled when (*ACCEPT) is
|
||||
encountered. This is also used to identify subpatterns that contain recursive
|
||||
back references to themselves, so that they can be made atomic. */
|
||||
|
||||
typedef struct open_capitem {
|
||||
struct open_capitem *next; /* Chain link to another open_capitem */
|
||||
pcre_uint16 number; /* Capture number of the open subpattern */
|
||||
pcre_uint16 flag; /* Set TRUE if the subpattern contains a recursive back reference to itself */
|
||||
} open_capitem;
|
||||
|
||||
/* Structure for passing "static" information around between the functions
|
||||
doing the compiling, so that they are thread-safe. */
|
||||
|
||||
@ -956,6 +1578,7 @@ typedef struct compile_data {
|
||||
const uschar *start_code; /* The start of the compiled code */
|
||||
const uschar *start_pattern; /* The start of the pattern */
|
||||
const uschar *end_pattern; /* The end of the pattern */
|
||||
open_capitem *open_caps; /* Chain of open capture items */
|
||||
uschar *hwm; /* High watermark of workspace */
|
||||
uschar *name_table; /* The name/number table */
|
||||
int names_found; /* Number of entries so far */
|
||||
@ -968,6 +1591,7 @@ typedef struct compile_data {
|
||||
int external_flags; /* External flag bits to be set */
|
||||
int req_varyopt; /* "After variable item" flag for reqbyte */
|
||||
BOOL had_accept; /* (*ACCEPT) encountered */
|
||||
BOOL check_lookbehind; /* Lookbehinds need later checking */
|
||||
int nltype; /* Newline type */
|
||||
int nllen; /* Newline string length */
|
||||
uschar nl[4]; /* Newline string when fixed length */
|
||||
@ -978,7 +1602,7 @@ branches, for testing for left recursion. */
|
||||
|
||||
typedef struct branch_chain {
|
||||
struct branch_chain *outer;
|
||||
uschar *current;
|
||||
uschar *current_branch;
|
||||
} branch_chain;
|
||||
|
||||
/* Structure for items in a linked list that represents an explicit recursive
|
||||
@ -988,9 +1612,9 @@ typedef struct recursion_info {
|
||||
struct recursion_info *prevrec; /* Previous recursion record (or NULL) */
|
||||
int group_num; /* Number of group that was called */
|
||||
const uschar *after_call; /* "Return value": points after the call in the expr */
|
||||
USPTR save_start; /* Old value of mstart */
|
||||
int *offset_save; /* Pointer to start of saved offsets */
|
||||
int saved_max; /* Number of saved offsets */
|
||||
int save_offset_top; /* Current value of offset_top */
|
||||
} recursion_info;
|
||||
|
||||
/* Structure for building a chain of data for holding the values of the subject
|
||||
@ -1015,6 +1639,9 @@ typedef struct match_data {
|
||||
int offset_max; /* The maximum usable for return data */
|
||||
int nltype; /* Newline type */
|
||||
int nllen; /* Newline string length */
|
||||
int name_count; /* Number of names in name table */
|
||||
int name_entry_size; /* Size of entry in names table */
|
||||
uschar *name_table; /* Table of names */
|
||||
uschar nl[4]; /* Newline string when fixed */
|
||||
const uschar *lcc; /* Points to lower casing table */
|
||||
const uschar *ctypes; /* Points to table of type maps */
|
||||
@ -1025,7 +1652,7 @@ typedef struct match_data {
|
||||
BOOL jscript_compat; /* JAVASCRIPT_COMPAT flag */
|
||||
BOOL endonly; /* Dollar not before final \n */
|
||||
BOOL notempty; /* Empty string match not wanted */
|
||||
BOOL partial; /* PARTIAL flag */
|
||||
BOOL notempty_atstart; /* Empty string match at start not wanted */
|
||||
BOOL hitend; /* Hit the end of the subject at some point */
|
||||
BOOL bsr_anycrlf; /* \R is just any CRLF, not full Unicode */
|
||||
const uschar *start_code; /* For use when recursing */
|
||||
@ -1033,6 +1660,8 @@ typedef struct match_data {
|
||||
USPTR end_subject; /* End of the subject string */
|
||||
USPTR start_match_ptr; /* Start of matched string */
|
||||
USPTR end_match_ptr; /* Subject position at end match */
|
||||
USPTR start_used_ptr; /* Earliest consulted character */
|
||||
int partial; /* PARTIAL options */
|
||||
int end_offset_top; /* Highwater mark at end of match */
|
||||
int capture_last; /* Most recent capture number */
|
||||
int start_offset; /* The start offset value */
|
||||
@ -1049,7 +1678,9 @@ typedef struct dfa_match_data {
|
||||
const uschar *start_code; /* Start of the compiled pattern */
|
||||
const uschar *start_subject; /* Start of the subject string */
|
||||
const uschar *end_subject; /* End of subject string */
|
||||
const uschar *start_used_ptr; /* Earliest consulted character */
|
||||
const uschar *tables; /* Character tables */
|
||||
int start_offset; /* The start offset value */
|
||||
int moptions; /* Match options */
|
||||
int poptions; /* Pattern options */
|
||||
int nltype; /* Newline type */
|
||||
@ -1128,20 +1759,30 @@ extern const uschar _pcre_OP_lengths[];
|
||||
one of the exported public functions. They have to be "external" in the C
|
||||
sense, but are not part of the PCRE public API. */
|
||||
|
||||
extern BOOL _pcre_is_newline(const uschar *, int, const uschar *,
|
||||
int *, BOOL);
|
||||
extern int _pcre_ord2utf8(int, uschar *);
|
||||
extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *,
|
||||
const pcre_study_data *, pcre_study_data *);
|
||||
extern int _pcre_valid_utf8(const uschar *, int);
|
||||
extern BOOL _pcre_was_newline(const uschar *, int, const uschar *,
|
||||
int *, BOOL);
|
||||
extern BOOL _pcre_xclass(int, const uschar *);
|
||||
extern unsigned int _pcre_ucp_othercase(unsigned int);
|
||||
extern const uschar *_pcre_find_bracket(const uschar *, BOOL, int);
|
||||
extern BOOL _pcre_is_newline(USPTR, int, USPTR, int *, BOOL);
|
||||
extern int _pcre_ord2utf8(int, uschar *);
|
||||
extern real_pcre *_pcre_try_flipped(const real_pcre *, real_pcre *,
|
||||
const pcre_study_data *, pcre_study_data *);
|
||||
extern int _pcre_valid_utf8(USPTR, int);
|
||||
extern BOOL _pcre_was_newline(USPTR, int, USPTR, int *, BOOL);
|
||||
extern BOOL _pcre_xclass(int, const uschar *);
|
||||
|
||||
|
||||
/* Unicode character database (UCD) */
|
||||
|
||||
/* One entry of the Unicode character database (UCD) record table. */
typedef struct {
|
||||
uschar script; /* presumably a Unicode script code - TODO confirm against ucp.h */
|
||||
uschar chartype; /* presumably a Unicode character type code - TODO confirm against ucp.h */
|
||||
pcre_int32 other_case; /* other-case information; exact encoding not visible here - confirm in the UCD access macros */
|
||||
} ucd_record;
|
||||
|
||||
extern const ucd_record _pcre_ucd_records[];
|
||||
extern const uschar _pcre_ucd_stage1[];
|
||||
extern const pcre_uint16 _pcre_ucd_stage2[];
|
||||
extern const int _pcre_ucp_gentype[];
|
||||
|
||||
extern unsigned int _pcre_ucp_othercase (unsigned int);
|
||||
|
||||
/* UCD access macros */
|
||||
|
||||
|
@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
Copyright (c) 1997-2009 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -73,8 +73,7 @@ Returns: TRUE or FALSE
|
||||
*/
|
||||
|
||||
BOOL
|
||||
_pcre_is_newline(const uschar *ptr, int type, const uschar *endptr,
|
||||
int *lenptr, BOOL utf8)
|
||||
_pcre_is_newline(USPTR ptr, int type, USPTR endptr, int *lenptr, BOOL utf8)
|
||||
{
|
||||
int c;
|
||||
if (utf8) { GETCHAR(c, ptr); } else c = *ptr;
|
||||
@ -123,8 +122,7 @@ Returns: TRUE or FALSE
|
||||
*/
|
||||
|
||||
BOOL
|
||||
_pcre_was_newline(const uschar *ptr, int type, const uschar *startptr,
|
||||
int *lenptr, BOOL utf8)
|
||||
_pcre_was_newline(USPTR ptr, int type, USPTR startptr, int *lenptr, BOOL utf8)
|
||||
{
|
||||
int c;
|
||||
ptr--;
|
||||
|
@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
Copyright (c) 1997-2010 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -54,6 +54,379 @@ supporting functions. */
|
||||
enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Find the minimum subject length for a group *
|
||||
*************************************************/
|
||||
|
||||
/* Scan a parenthesized group and compute the minimum length of subject that
|
||||
is needed to match it. This is a lower bound; it does not mean there is a
|
||||
string of that length that matches. In UTF8 mode, the result is in characters
|
||||
rather than bytes.
|
||||
|
||||
Arguments:
|
||||
code pointer to start of group (the bracket)
|
||||
startcode pointer to start of the whole pattern
|
||||
options the compiling options
|
||||
|
||||
Returns: the minimum length
|
||||
-1 if \C was encountered
|
||||
-2 internal error (missing capturing bracket)
|
||||
*/
|
||||
|
||||
static int
|
||||
find_minlength(const uschar *code, const uschar *startcode, int options)
|
||||
{
|
||||
int length = -1;
|
||||
BOOL utf8 = (options & PCRE_UTF8) != 0;
|
||||
BOOL had_recurse = FALSE;
|
||||
register int branchlength = 0;
|
||||
register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
|
||||
|
||||
if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
|
||||
|
||||
/* Scan along the opcodes for this branch. If we get to the end of the
|
||||
branch, check the length against that of the other branches. */
|
||||
|
||||
for (;;)
|
||||
{
|
||||
int d, min;
|
||||
uschar *cs, *ce;
|
||||
register int op = *cc;
|
||||
|
||||
switch (op)
|
||||
{
|
||||
case OP_COND:
|
||||
case OP_SCOND:
|
||||
|
||||
/* If there is only one branch in a condition, the implied branch has zero
|
||||
length, so we don't add anything. This covers the DEFINE "condition"
|
||||
automatically. */
|
||||
|
||||
cs = cc + GET(cc, 1);
|
||||
if (*cs != OP_ALT)
|
||||
{
|
||||
cc = cs + 1 + LINK_SIZE;
|
||||
break;
|
||||
}
|
||||
|
||||
/* Otherwise we can fall through and treat it the same as any other
|
||||
subpattern. */
|
||||
|
||||
case OP_CBRA:
|
||||
case OP_SCBRA:
|
||||
case OP_BRA:
|
||||
case OP_SBRA:
|
||||
case OP_ONCE:
|
||||
d = find_minlength(cc, startcode, options);
|
||||
if (d < 0) return d;
|
||||
branchlength += d;
|
||||
do cc += GET(cc, 1); while (*cc == OP_ALT);
|
||||
cc += 1 + LINK_SIZE;
|
||||
break;
|
||||
|
||||
/* Reached end of a branch; if it's a ket it is the end of a nested
|
||||
call. If it's ALT it is an alternation in a nested call. If it is
|
||||
END it's the end of the outer call. All can be handled by the same code. */
|
||||
|
||||
case OP_ALT:
|
||||
case OP_KET:
|
||||
case OP_KETRMAX:
|
||||
case OP_KETRMIN:
|
||||
case OP_END:
|
||||
if (length < 0 || (!had_recurse && branchlength < length))
|
||||
length = branchlength;
|
||||
if (*cc != OP_ALT) return length;
|
||||
cc += 1 + LINK_SIZE;
|
||||
branchlength = 0;
|
||||
had_recurse = FALSE;
|
||||
break;
|
||||
|
||||
/* Skip over assertive subpatterns */
|
||||
|
||||
case OP_ASSERT:
|
||||
case OP_ASSERT_NOT:
|
||||
case OP_ASSERTBACK:
|
||||
case OP_ASSERTBACK_NOT:
|
||||
do cc += GET(cc, 1); while (*cc == OP_ALT);
|
||||
/* Fall through */
|
||||
|
||||
/* Skip over things that don't match chars */
|
||||
|
||||
case OP_REVERSE:
|
||||
case OP_CREF:
|
||||
case OP_NCREF:
|
||||
case OP_RREF:
|
||||
case OP_NRREF:
|
||||
case OP_DEF:
|
||||
case OP_OPT:
|
||||
case OP_CALLOUT:
|
||||
case OP_SOD:
|
||||
case OP_SOM:
|
||||
case OP_EOD:
|
||||
case OP_EODN:
|
||||
case OP_CIRC:
|
||||
case OP_DOLL:
|
||||
case OP_NOT_WORD_BOUNDARY:
|
||||
case OP_WORD_BOUNDARY:
|
||||
cc += _pcre_OP_lengths[*cc];
|
||||
break;
|
||||
|
||||
/* Skip over a subpattern that has a {0} or {0,x} quantifier */
|
||||
|
||||
case OP_BRAZERO:
|
||||
case OP_BRAMINZERO:
|
||||
case OP_SKIPZERO:
|
||||
cc += _pcre_OP_lengths[*cc];
|
||||
do cc += GET(cc, 1); while (*cc == OP_ALT);
|
||||
cc += 1 + LINK_SIZE;
|
||||
break;
|
||||
|
||||
/* Handle literal characters and + repetitions */
|
||||
|
||||
case OP_CHAR:
|
||||
case OP_CHARNC:
|
||||
case OP_NOT:
|
||||
case OP_PLUS:
|
||||
case OP_MINPLUS:
|
||||
case OP_POSPLUS:
|
||||
case OP_NOTPLUS:
|
||||
case OP_NOTMINPLUS:
|
||||
case OP_NOTPOSPLUS:
|
||||
branchlength++;
|
||||
cc += 2;
|
||||
#ifdef SUPPORT_UTF8
|
||||
if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
|
||||
#endif
|
||||
break;
|
||||
|
||||
case OP_TYPEPLUS:
|
||||
case OP_TYPEMINPLUS:
|
||||
case OP_TYPEPOSPLUS:
|
||||
branchlength++;
|
||||
cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
|
||||
break;
|
||||
|
||||
/* Handle exact repetitions. The count is already in characters, but we
|
||||
need to skip over a multibyte character in UTF8 mode. */
|
||||
|
||||
case OP_EXACT:
|
||||
case OP_NOTEXACT:
|
||||
branchlength += GET2(cc,1);
|
||||
cc += 4;
|
||||
#ifdef SUPPORT_UTF8
|
||||
if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
|
||||
#endif
|
||||
break;
|
||||
|
||||
case OP_TYPEEXACT:
|
||||
branchlength += GET2(cc,1);
|
||||
cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
|
||||
break;
|
||||
|
||||
/* Handle single-char non-literal matchers */
|
||||
|
||||
case OP_PROP:
|
||||
case OP_NOTPROP:
|
||||
cc += 2;
|
||||
/* Fall through */
|
||||
|
||||
case OP_NOT_DIGIT:
|
||||
case OP_DIGIT:
|
||||
case OP_NOT_WHITESPACE:
|
||||
case OP_WHITESPACE:
|
||||
case OP_NOT_WORDCHAR:
|
||||
case OP_WORDCHAR:
|
||||
case OP_ANY:
|
||||
case OP_ALLANY:
|
||||
case OP_EXTUNI:
|
||||
case OP_HSPACE:
|
||||
case OP_NOT_HSPACE:
|
||||
case OP_VSPACE:
|
||||
case OP_NOT_VSPACE:
|
||||
branchlength++;
|
||||
cc++;
|
||||
break;
|
||||
|
||||
/* "Any newline" might match two characters */
|
||||
|
||||
case OP_ANYNL:
|
||||
branchlength += 2;
|
||||
cc++;
|
||||
break;
|
||||
|
||||
/* The single-byte matcher means we can't proceed in UTF-8 mode */
|
||||
|
||||
case OP_ANYBYTE:
|
||||
#ifdef SUPPORT_UTF8
|
||||
if (utf8) return -1;
|
||||
#endif
|
||||
branchlength++;
|
||||
cc++;
|
||||
break;
|
||||
|
||||
/* For repeated character types, we have to test for \p and \P, which have
|
||||
an extra two bytes of parameters. */
|
||||
|
||||
case OP_TYPESTAR:
|
||||
case OP_TYPEMINSTAR:
|
||||
case OP_TYPEQUERY:
|
||||
case OP_TYPEMINQUERY:
|
||||
case OP_TYPEPOSSTAR:
|
||||
case OP_TYPEPOSQUERY:
|
||||
if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
|
||||
cc += _pcre_OP_lengths[op];
|
||||
break;
|
||||
|
||||
case OP_TYPEUPTO:
|
||||
case OP_TYPEMINUPTO:
|
||||
case OP_TYPEPOSUPTO:
|
||||
if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
|
||||
cc += _pcre_OP_lengths[op];
|
||||
break;
|
||||
|
||||
/* Check a class for variable quantification */
|
||||
|
||||
#ifdef SUPPORT_UTF8
|
||||
case OP_XCLASS:
|
||||
cc += GET(cc, 1) - 33;
|
||||
/* Fall through */
|
||||
#endif
|
||||
|
||||
case OP_CLASS:
|
||||
case OP_NCLASS:
|
||||
cc += 33;
|
||||
|
||||
switch (*cc)
|
||||
{
|
||||
case OP_CRPLUS:
|
||||
case OP_CRMINPLUS:
|
||||
branchlength++;
|
||||
/* Fall through */
|
||||
|
||||
case OP_CRSTAR:
|
||||
case OP_CRMINSTAR:
|
||||
case OP_CRQUERY:
|
||||
case OP_CRMINQUERY:
|
||||
cc++;
|
||||
break;
|
||||
|
||||
case OP_CRRANGE:
|
||||
case OP_CRMINRANGE:
|
||||
branchlength += GET2(cc,1);
|
||||
cc += 5;
|
||||
break;
|
||||
|
||||
default:
|
||||
branchlength++;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
/* Backreferences and subroutine calls are treated in the same way: we find
|
||||
the minimum length for the subpattern. A recursion, however, causes an
|
||||
a flag to be set that causes the length of this branch to be ignored. The
|
||||
logic is that a recursion can only make sense if there is another
|
||||
alternation that stops the recursing. That will provide the minimum length
|
||||
(when no recursion happens). A backreference within the group that it is
|
||||
referencing behaves in the same way.
|
||||
|
||||
If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
|
||||
matches an empty string (by default it causes a matching failure), so in
|
||||
that case we must set the minimum length to zero. */
|
||||
|
||||
case OP_REF:
|
||||
if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
|
||||
{
|
||||
ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
|
||||
if (cs == NULL) return -2;
|
||||
do ce += GET(ce, 1); while (*ce == OP_ALT);
|
||||
if (cc > cs && cc < ce)
|
||||
{
|
||||
d = 0;
|
||||
had_recurse = TRUE;
|
||||
}
|
||||
else d = find_minlength(cs, startcode, options);
|
||||
}
|
||||
else d = 0;
|
||||
cc += 3;
|
||||
|
||||
/* Handle repeated back references */
|
||||
|
||||
switch (*cc)
|
||||
{
|
||||
case OP_CRSTAR:
|
||||
case OP_CRMINSTAR:
|
||||
case OP_CRQUERY:
|
||||
case OP_CRMINQUERY:
|
||||
min = 0;
|
||||
cc++;
|
||||
break;
|
||||
|
||||
case OP_CRRANGE:
|
||||
case OP_CRMINRANGE:
|
||||
min = GET2(cc, 1);
|
||||
cc += 5;
|
||||
break;
|
||||
|
||||
default:
|
||||
min = 1;
|
||||
break;
|
||||
}
|
||||
|
||||
branchlength += min * d;
|
||||
break;
|
||||
|
||||
case OP_RECURSE:
|
||||
cs = ce = (uschar *)startcode + GET(cc, 1);
|
||||
if (cs == NULL) return -2;
|
||||
do ce += GET(ce, 1); while (*ce == OP_ALT);
|
||||
if (cc > cs && cc < ce)
|
||||
had_recurse = TRUE;
|
||||
else
|
||||
branchlength += find_minlength(cs, startcode, options);
|
||||
cc += 1 + LINK_SIZE;
|
||||
break;
|
||||
|
||||
/* Anything else does not or need not match a character. We can get the
|
||||
item's length from the table, but for those that can match zero occurrences
|
||||
of a character, we must take special action for UTF-8 characters. */
|
||||
|
||||
case OP_UPTO:
|
||||
case OP_NOTUPTO:
|
||||
case OP_MINUPTO:
|
||||
case OP_NOTMINUPTO:
|
||||
case OP_POSUPTO:
|
||||
case OP_STAR:
|
||||
case OP_MINSTAR:
|
||||
case OP_NOTMINSTAR:
|
||||
case OP_POSSTAR:
|
||||
case OP_NOTPOSSTAR:
|
||||
case OP_QUERY:
|
||||
case OP_MINQUERY:
|
||||
case OP_NOTMINQUERY:
|
||||
case OP_POSQUERY:
|
||||
case OP_NOTPOSQUERY:
|
||||
cc += _pcre_OP_lengths[op];
|
||||
#ifdef SUPPORT_UTF8
|
||||
if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
|
||||
#endif
|
||||
break;
|
||||
|
||||
/* For the record, these are the opcodes that are matched by "default":
|
||||
OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
|
||||
OP_THEN. */
|
||||
|
||||
default:
|
||||
cc += _pcre_OP_lengths[op];
|
||||
break;
|
||||
}
|
||||
}
|
||||
/* Control never gets here */
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Set a bit and maybe its alternate case *
|
||||
*************************************************/
|
||||
@ -71,7 +444,8 @@ Returns: nothing
|
||||
*/
|
||||
|
||||
static void
|
||||
set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
|
||||
set_table_bit(uschar *start_bits, unsigned int c, BOOL caseless,
|
||||
compile_data *cd)
|
||||
{
|
||||
start_bits[c/8] |= (1 << (c&7));
|
||||
if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
|
||||
@ -233,7 +607,7 @@ do
|
||||
case OP_QUERY:
|
||||
case OP_MINQUERY:
|
||||
case OP_POSQUERY:
|
||||
set_bit(start_bits, tcode[1], caseless, cd);
|
||||
set_table_bit(start_bits, tcode[1], caseless, cd);
|
||||
tcode += 2;
|
||||
#ifdef SUPPORT_UTF8
|
||||
if (utf8 && tcode[-1] >= 0xc0)
|
||||
@ -246,7 +620,7 @@ do
|
||||
case OP_UPTO:
|
||||
case OP_MINUPTO:
|
||||
case OP_POSUPTO:
|
||||
set_bit(start_bits, tcode[3], caseless, cd);
|
||||
set_table_bit(start_bits, tcode[3], caseless, cd);
|
||||
tcode += 4;
|
||||
#ifdef SUPPORT_UTF8
|
||||
if (utf8 && tcode[-1] >= 0xc0)
|
||||
@ -264,7 +638,7 @@ do
|
||||
case OP_PLUS:
|
||||
case OP_MINPLUS:
|
||||
case OP_POSPLUS:
|
||||
set_bit(start_bits, tcode[1], caseless, cd);
|
||||
set_table_bit(start_bits, tcode[1], caseless, cd);
|
||||
try_next = FALSE;
|
||||
break;
|
||||
|
||||
@ -500,13 +874,15 @@ Arguments:
|
||||
set NULL unless error
|
||||
|
||||
Returns: pointer to a pcre_extra block, with study_data filled in and the
|
||||
appropriate flag set;
|
||||
appropriate flags set;
|
||||
NULL on error or if no optimization possible
|
||||
*/
|
||||
|
||||
PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
|
||||
pcre_study(const pcre *external_re, int options, const char **errorptr)
|
||||
{
|
||||
int min;
|
||||
BOOL bits_set = FALSE;
|
||||
uschar start_bits[32];
|
||||
pcre_extra *extra;
|
||||
pcre_study_data *study;
|
||||
@ -533,30 +909,39 @@ code = (uschar *)re + re->name_table_offset +
|
||||
(re->name_count * re->name_entry_size);
|
||||
|
||||
/* For an anchored pattern, or an unanchored pattern that has a first char, or
|
||||
a multiline pattern that matches only at "line starts", no further processing
|
||||
at present. */
|
||||
a multiline pattern that matches only at "line starts", there is no point in
|
||||
seeking a list of starting bytes. */
|
||||
|
||||
if ((re->options & PCRE_ANCHORED) != 0 ||
|
||||
(re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
|
||||
return NULL;
|
||||
if ((re->options & PCRE_ANCHORED) == 0 &&
|
||||
(re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
|
||||
{
|
||||
/* Set the character tables in the block that is passed around */
|
||||
|
||||
/* Set the character tables in the block that is passed around */
|
||||
tables = re->tables;
|
||||
if (tables == NULL)
|
||||
(void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
|
||||
(void *)(&tables));
|
||||
|
||||
tables = re->tables;
|
||||
if (tables == NULL)
|
||||
(void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
|
||||
(void *)(&tables));
|
||||
compile_block.lcc = tables + lcc_offset;
|
||||
compile_block.fcc = tables + fcc_offset;
|
||||
compile_block.cbits = tables + cbits_offset;
|
||||
compile_block.ctypes = tables + ctypes_offset;
|
||||
|
||||
compile_block.lcc = tables + lcc_offset;
|
||||
compile_block.fcc = tables + fcc_offset;
|
||||
compile_block.cbits = tables + cbits_offset;
|
||||
compile_block.ctypes = tables + ctypes_offset;
|
||||
/* See if we can find a fixed set of initial characters for the pattern. */
|
||||
|
||||
/* See if we can find a fixed set of initial characters for the pattern. */
|
||||
memset(start_bits, 0, 32 * sizeof(uschar));
|
||||
bits_set = set_start_bits(code, start_bits,
|
||||
(re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
|
||||
&compile_block) == SSB_DONE;
|
||||
}
|
||||
|
||||
memset(start_bits, 0, 32 * sizeof(uschar));
|
||||
if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
|
||||
(re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;
|
||||
/* Find the minimum length of subject string. */
|
||||
|
||||
min = find_minlength(code, code, re->options);
|
||||
|
||||
/* Return NULL if no optimization is possible. */
|
||||
|
||||
if (!bits_set && min < 0) return NULL;
|
||||
|
||||
/* Get a pcre_extra block and a pcre_study_data block. The study data is put in
|
||||
the latter, which is pointed to by the former, which may also get additional
|
||||
@ -579,8 +964,19 @@ extra->flags = PCRE_EXTRA_STUDY_DATA;
|
||||
extra->study_data = study;
|
||||
|
||||
study->size = sizeof(pcre_study_data);
|
||||
study->options = PCRE_STUDY_MAPPED;
|
||||
memcpy(study->start_bits, start_bits, sizeof(start_bits));
|
||||
study->flags = 0;
|
||||
|
||||
if (bits_set)
|
||||
{
|
||||
study->flags |= PCRE_STUDY_MAPPED;
|
||||
memcpy(study->start_bits, start_bits, sizeof(start_bits));
|
||||
}
|
||||
|
||||
if (min >= 0)
|
||||
{
|
||||
study->flags |= PCRE_STUDY_MINLEN;
|
||||
study->minlength = min;
|
||||
}
|
||||
|
||||
return extra;
|
||||
}
|
||||
|
@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
Copyright (c) 1997-2009 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -109,244 +109,411 @@ putting all the names into a single, large string and then using offsets in the
|
||||
table itself. Maintenance is more error-prone, but frequent changes to this
|
||||
data are unlikely.
|
||||
|
||||
July 2008: There is now a script called maint/GenerateUtt.py which can be used
|
||||
to generate this data instead of maintaining it entirely by hand. */
|
||||
July 2008: There is now a script called maint/GenerateUtt.py that can be used
|
||||
to generate this data instead of maintaining it entirely by hand.
|
||||
|
||||
The script was updated in March 2009 to generate a new EBCDIC-compliant
|
||||
version. Like all other character and string literals that are compared against
|
||||
the regular expression pattern, we must use STR_ macros instead of literal
|
||||
strings to make sure that UTF-8 support works on EBCDIC platforms. */
|
||||
|
||||
#define STRING_Any0 STR_A STR_n STR_y "\0"
|
||||
#define STRING_Arabic0 STR_A STR_r STR_a STR_b STR_i STR_c "\0"
|
||||
#define STRING_Armenian0 STR_A STR_r STR_m STR_e STR_n STR_i STR_a STR_n "\0"
|
||||
#define STRING_Avestan0 STR_A STR_v STR_e STR_s STR_t STR_a STR_n "\0"
|
||||
#define STRING_Balinese0 STR_B STR_a STR_l STR_i STR_n STR_e STR_s STR_e "\0"
|
||||
#define STRING_Bamum0 STR_B STR_a STR_m STR_u STR_m "\0"
|
||||
#define STRING_Bengali0 STR_B STR_e STR_n STR_g STR_a STR_l STR_i "\0"
|
||||
#define STRING_Bopomofo0 STR_B STR_o STR_p STR_o STR_m STR_o STR_f STR_o "\0"
|
||||
#define STRING_Braille0 STR_B STR_r STR_a STR_i STR_l STR_l STR_e "\0"
|
||||
#define STRING_Buginese0 STR_B STR_u STR_g STR_i STR_n STR_e STR_s STR_e "\0"
|
||||
#define STRING_Buhid0 STR_B STR_u STR_h STR_i STR_d "\0"
|
||||
#define STRING_C0 STR_C "\0"
|
||||
#define STRING_Canadian_Aboriginal0 STR_C STR_a STR_n STR_a STR_d STR_i STR_a STR_n STR_UNDERSCORE STR_A STR_b STR_o STR_r STR_i STR_g STR_i STR_n STR_a STR_l "\0"
|
||||
#define STRING_Carian0 STR_C STR_a STR_r STR_i STR_a STR_n "\0"
|
||||
#define STRING_Cc0 STR_C STR_c "\0"
|
||||
#define STRING_Cf0 STR_C STR_f "\0"
|
||||
#define STRING_Cham0 STR_C STR_h STR_a STR_m "\0"
|
||||
#define STRING_Cherokee0 STR_C STR_h STR_e STR_r STR_o STR_k STR_e STR_e "\0"
|
||||
#define STRING_Cn0 STR_C STR_n "\0"
|
||||
#define STRING_Co0 STR_C STR_o "\0"
|
||||
#define STRING_Common0 STR_C STR_o STR_m STR_m STR_o STR_n "\0"
|
||||
#define STRING_Coptic0 STR_C STR_o STR_p STR_t STR_i STR_c "\0"
|
||||
#define STRING_Cs0 STR_C STR_s "\0"
|
||||
#define STRING_Cuneiform0 STR_C STR_u STR_n STR_e STR_i STR_f STR_o STR_r STR_m "\0"
|
||||
#define STRING_Cypriot0 STR_C STR_y STR_p STR_r STR_i STR_o STR_t "\0"
|
||||
#define STRING_Cyrillic0 STR_C STR_y STR_r STR_i STR_l STR_l STR_i STR_c "\0"
|
||||
#define STRING_Deseret0 STR_D STR_e STR_s STR_e STR_r STR_e STR_t "\0"
|
||||
#define STRING_Devanagari0 STR_D STR_e STR_v STR_a STR_n STR_a STR_g STR_a STR_r STR_i "\0"
|
||||
#define STRING_Egyptian_Hieroglyphs0 STR_E STR_g STR_y STR_p STR_t STR_i STR_a STR_n STR_UNDERSCORE STR_H STR_i STR_e STR_r STR_o STR_g STR_l STR_y STR_p STR_h STR_s "\0"
|
||||
#define STRING_Ethiopic0 STR_E STR_t STR_h STR_i STR_o STR_p STR_i STR_c "\0"
|
||||
#define STRING_Georgian0 STR_G STR_e STR_o STR_r STR_g STR_i STR_a STR_n "\0"
|
||||
#define STRING_Glagolitic0 STR_G STR_l STR_a STR_g STR_o STR_l STR_i STR_t STR_i STR_c "\0"
|
||||
#define STRING_Gothic0 STR_G STR_o STR_t STR_h STR_i STR_c "\0"
|
||||
#define STRING_Greek0 STR_G STR_r STR_e STR_e STR_k "\0"
|
||||
#define STRING_Gujarati0 STR_G STR_u STR_j STR_a STR_r STR_a STR_t STR_i "\0"
|
||||
#define STRING_Gurmukhi0 STR_G STR_u STR_r STR_m STR_u STR_k STR_h STR_i "\0"
|
||||
#define STRING_Han0 STR_H STR_a STR_n "\0"
|
||||
#define STRING_Hangul0 STR_H STR_a STR_n STR_g STR_u STR_l "\0"
|
||||
#define STRING_Hanunoo0 STR_H STR_a STR_n STR_u STR_n STR_o STR_o "\0"
|
||||
#define STRING_Hebrew0 STR_H STR_e STR_b STR_r STR_e STR_w "\0"
|
||||
#define STRING_Hiragana0 STR_H STR_i STR_r STR_a STR_g STR_a STR_n STR_a "\0"
|
||||
#define STRING_Imperial_Aramaic0 STR_I STR_m STR_p STR_e STR_r STR_i STR_a STR_l STR_UNDERSCORE STR_A STR_r STR_a STR_m STR_a STR_i STR_c "\0"
|
||||
#define STRING_Inherited0 STR_I STR_n STR_h STR_e STR_r STR_i STR_t STR_e STR_d "\0"
|
||||
#define STRING_Inscriptional_Pahlavi0 STR_I STR_n STR_s STR_c STR_r STR_i STR_p STR_t STR_i STR_o STR_n STR_a STR_l STR_UNDERSCORE STR_P STR_a STR_h STR_l STR_a STR_v STR_i "\0"
|
||||
#define STRING_Inscriptional_Parthian0 STR_I STR_n STR_s STR_c STR_r STR_i STR_p STR_t STR_i STR_o STR_n STR_a STR_l STR_UNDERSCORE STR_P STR_a STR_r STR_t STR_h STR_i STR_a STR_n "\0"
|
||||
#define STRING_Javanese0 STR_J STR_a STR_v STR_a STR_n STR_e STR_s STR_e "\0"
|
||||
#define STRING_Kaithi0 STR_K STR_a STR_i STR_t STR_h STR_i "\0"
|
||||
#define STRING_Kannada0 STR_K STR_a STR_n STR_n STR_a STR_d STR_a "\0"
|
||||
#define STRING_Katakana0 STR_K STR_a STR_t STR_a STR_k STR_a STR_n STR_a "\0"
|
||||
#define STRING_Kayah_Li0 STR_K STR_a STR_y STR_a STR_h STR_UNDERSCORE STR_L STR_i "\0"
|
||||
#define STRING_Kharoshthi0 STR_K STR_h STR_a STR_r STR_o STR_s STR_h STR_t STR_h STR_i "\0"
|
||||
#define STRING_Khmer0 STR_K STR_h STR_m STR_e STR_r "\0"
|
||||
#define STRING_L0 STR_L "\0"
|
||||
#define STRING_L_AMPERSAND0 STR_L STR_AMPERSAND "\0"
|
||||
#define STRING_Lao0 STR_L STR_a STR_o "\0"
|
||||
#define STRING_Latin0 STR_L STR_a STR_t STR_i STR_n "\0"
|
||||
#define STRING_Lepcha0 STR_L STR_e STR_p STR_c STR_h STR_a "\0"
|
||||
#define STRING_Limbu0 STR_L STR_i STR_m STR_b STR_u "\0"
|
||||
#define STRING_Linear_B0 STR_L STR_i STR_n STR_e STR_a STR_r STR_UNDERSCORE STR_B "\0"
|
||||
#define STRING_Lisu0 STR_L STR_i STR_s STR_u "\0"
|
||||
#define STRING_Ll0 STR_L STR_l "\0"
|
||||
#define STRING_Lm0 STR_L STR_m "\0"
|
||||
#define STRING_Lo0 STR_L STR_o "\0"
|
||||
#define STRING_Lt0 STR_L STR_t "\0"
|
||||
#define STRING_Lu0 STR_L STR_u "\0"
|
||||
#define STRING_Lycian0 STR_L STR_y STR_c STR_i STR_a STR_n "\0"
|
||||
#define STRING_Lydian0 STR_L STR_y STR_d STR_i STR_a STR_n "\0"
|
||||
#define STRING_M0 STR_M "\0"
|
||||
#define STRING_Malayalam0 STR_M STR_a STR_l STR_a STR_y STR_a STR_l STR_a STR_m "\0"
|
||||
#define STRING_Mc0 STR_M STR_c "\0"
|
||||
#define STRING_Me0 STR_M STR_e "\0"
|
||||
#define STRING_Meetei_Mayek0 STR_M STR_e STR_e STR_t STR_e STR_i STR_UNDERSCORE STR_M STR_a STR_y STR_e STR_k "\0"
|
||||
#define STRING_Mn0 STR_M STR_n "\0"
|
||||
#define STRING_Mongolian0 STR_M STR_o STR_n STR_g STR_o STR_l STR_i STR_a STR_n "\0"
|
||||
#define STRING_Myanmar0 STR_M STR_y STR_a STR_n STR_m STR_a STR_r "\0"
|
||||
#define STRING_N0 STR_N "\0"
|
||||
#define STRING_Nd0 STR_N STR_d "\0"
|
||||
#define STRING_New_Tai_Lue0 STR_N STR_e STR_w STR_UNDERSCORE STR_T STR_a STR_i STR_UNDERSCORE STR_L STR_u STR_e "\0"
|
||||
#define STRING_Nko0 STR_N STR_k STR_o "\0"
|
||||
#define STRING_Nl0 STR_N STR_l "\0"
|
||||
#define STRING_No0 STR_N STR_o "\0"
|
||||
#define STRING_Ogham0 STR_O STR_g STR_h STR_a STR_m "\0"
|
||||
#define STRING_Ol_Chiki0 STR_O STR_l STR_UNDERSCORE STR_C STR_h STR_i STR_k STR_i "\0"
|
||||
#define STRING_Old_Italic0 STR_O STR_l STR_d STR_UNDERSCORE STR_I STR_t STR_a STR_l STR_i STR_c "\0"
|
||||
#define STRING_Old_Persian0 STR_O STR_l STR_d STR_UNDERSCORE STR_P STR_e STR_r STR_s STR_i STR_a STR_n "\0"
|
||||
#define STRING_Old_South_Arabian0 STR_O STR_l STR_d STR_UNDERSCORE STR_S STR_o STR_u STR_t STR_h STR_UNDERSCORE STR_A STR_r STR_a STR_b STR_i STR_a STR_n "\0"
|
||||
#define STRING_Old_Turkic0 STR_O STR_l STR_d STR_UNDERSCORE STR_T STR_u STR_r STR_k STR_i STR_c "\0"
|
||||
#define STRING_Oriya0 STR_O STR_r STR_i STR_y STR_a "\0"
|
||||
#define STRING_Osmanya0 STR_O STR_s STR_m STR_a STR_n STR_y STR_a "\0"
|
||||
#define STRING_P0 STR_P "\0"
|
||||
#define STRING_Pc0 STR_P STR_c "\0"
|
||||
#define STRING_Pd0 STR_P STR_d "\0"
|
||||
#define STRING_Pe0 STR_P STR_e "\0"
|
||||
#define STRING_Pf0 STR_P STR_f "\0"
|
||||
#define STRING_Phags_Pa0 STR_P STR_h STR_a STR_g STR_s STR_UNDERSCORE STR_P STR_a "\0"
|
||||
#define STRING_Phoenician0 STR_P STR_h STR_o STR_e STR_n STR_i STR_c STR_i STR_a STR_n "\0"
|
||||
#define STRING_Pi0 STR_P STR_i "\0"
|
||||
#define STRING_Po0 STR_P STR_o "\0"
|
||||
#define STRING_Ps0 STR_P STR_s "\0"
|
||||
#define STRING_Rejang0 STR_R STR_e STR_j STR_a STR_n STR_g "\0"
|
||||
#define STRING_Runic0 STR_R STR_u STR_n STR_i STR_c "\0"
|
||||
#define STRING_S0 STR_S "\0"
|
||||
#define STRING_Samaritan0 STR_S STR_a STR_m STR_a STR_r STR_i STR_t STR_a STR_n "\0"
|
||||
#define STRING_Saurashtra0 STR_S STR_a STR_u STR_r STR_a STR_s STR_h STR_t STR_r STR_a "\0"
|
||||
#define STRING_Sc0 STR_S STR_c "\0"
|
||||
#define STRING_Shavian0 STR_S STR_h STR_a STR_v STR_i STR_a STR_n "\0"
|
||||
#define STRING_Sinhala0 STR_S STR_i STR_n STR_h STR_a STR_l STR_a "\0"
|
||||
#define STRING_Sk0 STR_S STR_k "\0"
|
||||
#define STRING_Sm0 STR_S STR_m "\0"
|
||||
#define STRING_So0 STR_S STR_o "\0"
|
||||
#define STRING_Sundanese0 STR_S STR_u STR_n STR_d STR_a STR_n STR_e STR_s STR_e "\0"
|
||||
#define STRING_Syloti_Nagri0 STR_S STR_y STR_l STR_o STR_t STR_i STR_UNDERSCORE STR_N STR_a STR_g STR_r STR_i "\0"
|
||||
#define STRING_Syriac0 STR_S STR_y STR_r STR_i STR_a STR_c "\0"
|
||||
#define STRING_Tagalog0 STR_T STR_a STR_g STR_a STR_l STR_o STR_g "\0"
|
||||
#define STRING_Tagbanwa0 STR_T STR_a STR_g STR_b STR_a STR_n STR_w STR_a "\0"
|
||||
#define STRING_Tai_Le0 STR_T STR_a STR_i STR_UNDERSCORE STR_L STR_e "\0"
|
||||
#define STRING_Tai_Tham0 STR_T STR_a STR_i STR_UNDERSCORE STR_T STR_h STR_a STR_m "\0"
|
||||
#define STRING_Tai_Viet0 STR_T STR_a STR_i STR_UNDERSCORE STR_V STR_i STR_e STR_t "\0"
|
||||
#define STRING_Tamil0 STR_T STR_a STR_m STR_i STR_l "\0"
|
||||
#define STRING_Telugu0 STR_T STR_e STR_l STR_u STR_g STR_u "\0"
|
||||
#define STRING_Thaana0 STR_T STR_h STR_a STR_a STR_n STR_a "\0"
|
||||
#define STRING_Thai0 STR_T STR_h STR_a STR_i "\0"
|
||||
#define STRING_Tibetan0 STR_T STR_i STR_b STR_e STR_t STR_a STR_n "\0"
|
||||
#define STRING_Tifinagh0 STR_T STR_i STR_f STR_i STR_n STR_a STR_g STR_h "\0"
|
||||
#define STRING_Ugaritic0 STR_U STR_g STR_a STR_r STR_i STR_t STR_i STR_c "\0"
|
||||
#define STRING_Vai0 STR_V STR_a STR_i "\0"
|
||||
#define STRING_Yi0 STR_Y STR_i "\0"
|
||||
#define STRING_Z0 STR_Z "\0"
|
||||
#define STRING_Zl0 STR_Z STR_l "\0"
|
||||
#define STRING_Zp0 STR_Z STR_p "\0"
|
||||
#define STRING_Zs0 STR_Z STR_s "\0"
|
||||
|
||||
const char _pcre_utt_names[] =
|
||||
"Any\0"
|
||||
"Arabic\0"
|
||||
"Armenian\0"
|
||||
"Balinese\0"
|
||||
"Bengali\0"
|
||||
"Bopomofo\0"
|
||||
"Braille\0"
|
||||
"Buginese\0"
|
||||
"Buhid\0"
|
||||
"C\0"
|
||||
"Canadian_Aboriginal\0"
|
||||
"Carian\0"
|
||||
"Cc\0"
|
||||
"Cf\0"
|
||||
"Cham\0"
|
||||
"Cherokee\0"
|
||||
"Cn\0"
|
||||
"Co\0"
|
||||
"Common\0"
|
||||
"Coptic\0"
|
||||
"Cs\0"
|
||||
"Cuneiform\0"
|
||||
"Cypriot\0"
|
||||
"Cyrillic\0"
|
||||
"Deseret\0"
|
||||
"Devanagari\0"
|
||||
"Ethiopic\0"
|
||||
"Georgian\0"
|
||||
"Glagolitic\0"
|
||||
"Gothic\0"
|
||||
"Greek\0"
|
||||
"Gujarati\0"
|
||||
"Gurmukhi\0"
|
||||
"Han\0"
|
||||
"Hangul\0"
|
||||
"Hanunoo\0"
|
||||
"Hebrew\0"
|
||||
"Hiragana\0"
|
||||
"Inherited\0"
|
||||
"Kannada\0"
|
||||
"Katakana\0"
|
||||
"Kayah_Li\0"
|
||||
"Kharoshthi\0"
|
||||
"Khmer\0"
|
||||
"L\0"
|
||||
"L&\0"
|
||||
"Lao\0"
|
||||
"Latin\0"
|
||||
"Lepcha\0"
|
||||
"Limbu\0"
|
||||
"Linear_B\0"
|
||||
"Ll\0"
|
||||
"Lm\0"
|
||||
"Lo\0"
|
||||
"Lt\0"
|
||||
"Lu\0"
|
||||
"Lycian\0"
|
||||
"Lydian\0"
|
||||
"M\0"
|
||||
"Malayalam\0"
|
||||
"Mc\0"
|
||||
"Me\0"
|
||||
"Mn\0"
|
||||
"Mongolian\0"
|
||||
"Myanmar\0"
|
||||
"N\0"
|
||||
"Nd\0"
|
||||
"New_Tai_Lue\0"
|
||||
"Nko\0"
|
||||
"Nl\0"
|
||||
"No\0"
|
||||
"Ogham\0"
|
||||
"Ol_Chiki\0"
|
||||
"Old_Italic\0"
|
||||
"Old_Persian\0"
|
||||
"Oriya\0"
|
||||
"Osmanya\0"
|
||||
"P\0"
|
||||
"Pc\0"
|
||||
"Pd\0"
|
||||
"Pe\0"
|
||||
"Pf\0"
|
||||
"Phags_Pa\0"
|
||||
"Phoenician\0"
|
||||
"Pi\0"
|
||||
"Po\0"
|
||||
"Ps\0"
|
||||
"Rejang\0"
|
||||
"Runic\0"
|
||||
"S\0"
|
||||
"Saurashtra\0"
|
||||
"Sc\0"
|
||||
"Shavian\0"
|
||||
"Sinhala\0"
|
||||
"Sk\0"
|
||||
"Sm\0"
|
||||
"So\0"
|
||||
"Sundanese\0"
|
||||
"Syloti_Nagri\0"
|
||||
"Syriac\0"
|
||||
"Tagalog\0"
|
||||
"Tagbanwa\0"
|
||||
"Tai_Le\0"
|
||||
"Tamil\0"
|
||||
"Telugu\0"
|
||||
"Thaana\0"
|
||||
"Thai\0"
|
||||
"Tibetan\0"
|
||||
"Tifinagh\0"
|
||||
"Ugaritic\0"
|
||||
"Vai\0"
|
||||
"Yi\0"
|
||||
"Z\0"
|
||||
"Zl\0"
|
||||
"Zp\0"
|
||||
"Zs\0";
|
||||
STRING_Any0
|
||||
STRING_Arabic0
|
||||
STRING_Armenian0
|
||||
STRING_Avestan0
|
||||
STRING_Balinese0
|
||||
STRING_Bamum0
|
||||
STRING_Bengali0
|
||||
STRING_Bopomofo0
|
||||
STRING_Braille0
|
||||
STRING_Buginese0
|
||||
STRING_Buhid0
|
||||
STRING_C0
|
||||
STRING_Canadian_Aboriginal0
|
||||
STRING_Carian0
|
||||
STRING_Cc0
|
||||
STRING_Cf0
|
||||
STRING_Cham0
|
||||
STRING_Cherokee0
|
||||
STRING_Cn0
|
||||
STRING_Co0
|
||||
STRING_Common0
|
||||
STRING_Coptic0
|
||||
STRING_Cs0
|
||||
STRING_Cuneiform0
|
||||
STRING_Cypriot0
|
||||
STRING_Cyrillic0
|
||||
STRING_Deseret0
|
||||
STRING_Devanagari0
|
||||
STRING_Egyptian_Hieroglyphs0
|
||||
STRING_Ethiopic0
|
||||
STRING_Georgian0
|
||||
STRING_Glagolitic0
|
||||
STRING_Gothic0
|
||||
STRING_Greek0
|
||||
STRING_Gujarati0
|
||||
STRING_Gurmukhi0
|
||||
STRING_Han0
|
||||
STRING_Hangul0
|
||||
STRING_Hanunoo0
|
||||
STRING_Hebrew0
|
||||
STRING_Hiragana0
|
||||
STRING_Imperial_Aramaic0
|
||||
STRING_Inherited0
|
||||
STRING_Inscriptional_Pahlavi0
|
||||
STRING_Inscriptional_Parthian0
|
||||
STRING_Javanese0
|
||||
STRING_Kaithi0
|
||||
STRING_Kannada0
|
||||
STRING_Katakana0
|
||||
STRING_Kayah_Li0
|
||||
STRING_Kharoshthi0
|
||||
STRING_Khmer0
|
||||
STRING_L0
|
||||
STRING_L_AMPERSAND0
|
||||
STRING_Lao0
|
||||
STRING_Latin0
|
||||
STRING_Lepcha0
|
||||
STRING_Limbu0
|
||||
STRING_Linear_B0
|
||||
STRING_Lisu0
|
||||
STRING_Ll0
|
||||
STRING_Lm0
|
||||
STRING_Lo0
|
||||
STRING_Lt0
|
||||
STRING_Lu0
|
||||
STRING_Lycian0
|
||||
STRING_Lydian0
|
||||
STRING_M0
|
||||
STRING_Malayalam0
|
||||
STRING_Mc0
|
||||
STRING_Me0
|
||||
STRING_Meetei_Mayek0
|
||||
STRING_Mn0
|
||||
STRING_Mongolian0
|
||||
STRING_Myanmar0
|
||||
STRING_N0
|
||||
STRING_Nd0
|
||||
STRING_New_Tai_Lue0
|
||||
STRING_Nko0
|
||||
STRING_Nl0
|
||||
STRING_No0
|
||||
STRING_Ogham0
|
||||
STRING_Ol_Chiki0
|
||||
STRING_Old_Italic0
|
||||
STRING_Old_Persian0
|
||||
STRING_Old_South_Arabian0
|
||||
STRING_Old_Turkic0
|
||||
STRING_Oriya0
|
||||
STRING_Osmanya0
|
||||
STRING_P0
|
||||
STRING_Pc0
|
||||
STRING_Pd0
|
||||
STRING_Pe0
|
||||
STRING_Pf0
|
||||
STRING_Phags_Pa0
|
||||
STRING_Phoenician0
|
||||
STRING_Pi0
|
||||
STRING_Po0
|
||||
STRING_Ps0
|
||||
STRING_Rejang0
|
||||
STRING_Runic0
|
||||
STRING_S0
|
||||
STRING_Samaritan0
|
||||
STRING_Saurashtra0
|
||||
STRING_Sc0
|
||||
STRING_Shavian0
|
||||
STRING_Sinhala0
|
||||
STRING_Sk0
|
||||
STRING_Sm0
|
||||
STRING_So0
|
||||
STRING_Sundanese0
|
||||
STRING_Syloti_Nagri0
|
||||
STRING_Syriac0
|
||||
STRING_Tagalog0
|
||||
STRING_Tagbanwa0
|
||||
STRING_Tai_Le0
|
||||
STRING_Tai_Tham0
|
||||
STRING_Tai_Viet0
|
||||
STRING_Tamil0
|
||||
STRING_Telugu0
|
||||
STRING_Thaana0
|
||||
STRING_Thai0
|
||||
STRING_Tibetan0
|
||||
STRING_Tifinagh0
|
||||
STRING_Ugaritic0
|
||||
STRING_Vai0
|
||||
STRING_Yi0
|
||||
STRING_Z0
|
||||
STRING_Zl0
|
||||
STRING_Zp0
|
||||
STRING_Zs0;
|
||||
|
||||
const ucp_type_table _pcre_utt[] = {
|
||||
{ 0, PT_ANY, 0 },
|
||||
{ 4, PT_SC, ucp_Arabic },
|
||||
{ 11, PT_SC, ucp_Armenian },
|
||||
{ 20, PT_SC, ucp_Balinese },
|
||||
{ 29, PT_SC, ucp_Bengali },
|
||||
{ 37, PT_SC, ucp_Bopomofo },
|
||||
{ 46, PT_SC, ucp_Braille },
|
||||
{ 54, PT_SC, ucp_Buginese },
|
||||
{ 63, PT_SC, ucp_Buhid },
|
||||
{ 69, PT_GC, ucp_C },
|
||||
{ 71, PT_SC, ucp_Canadian_Aboriginal },
|
||||
{ 91, PT_SC, ucp_Carian },
|
||||
{ 98, PT_PC, ucp_Cc },
|
||||
{ 101, PT_PC, ucp_Cf },
|
||||
{ 104, PT_SC, ucp_Cham },
|
||||
{ 109, PT_SC, ucp_Cherokee },
|
||||
{ 118, PT_PC, ucp_Cn },
|
||||
{ 121, PT_PC, ucp_Co },
|
||||
{ 124, PT_SC, ucp_Common },
|
||||
{ 131, PT_SC, ucp_Coptic },
|
||||
{ 138, PT_PC, ucp_Cs },
|
||||
{ 141, PT_SC, ucp_Cuneiform },
|
||||
{ 151, PT_SC, ucp_Cypriot },
|
||||
{ 159, PT_SC, ucp_Cyrillic },
|
||||
{ 168, PT_SC, ucp_Deseret },
|
||||
{ 176, PT_SC, ucp_Devanagari },
|
||||
{ 187, PT_SC, ucp_Ethiopic },
|
||||
{ 196, PT_SC, ucp_Georgian },
|
||||
{ 205, PT_SC, ucp_Glagolitic },
|
||||
{ 216, PT_SC, ucp_Gothic },
|
||||
{ 223, PT_SC, ucp_Greek },
|
||||
{ 229, PT_SC, ucp_Gujarati },
|
||||
{ 238, PT_SC, ucp_Gurmukhi },
|
||||
{ 247, PT_SC, ucp_Han },
|
||||
{ 251, PT_SC, ucp_Hangul },
|
||||
{ 258, PT_SC, ucp_Hanunoo },
|
||||
{ 266, PT_SC, ucp_Hebrew },
|
||||
{ 273, PT_SC, ucp_Hiragana },
|
||||
{ 282, PT_SC, ucp_Inherited },
|
||||
{ 292, PT_SC, ucp_Kannada },
|
||||
{ 300, PT_SC, ucp_Katakana },
|
||||
{ 309, PT_SC, ucp_Kayah_Li },
|
||||
{ 318, PT_SC, ucp_Kharoshthi },
|
||||
{ 329, PT_SC, ucp_Khmer },
|
||||
{ 335, PT_GC, ucp_L },
|
||||
{ 337, PT_LAMP, 0 },
|
||||
{ 340, PT_SC, ucp_Lao },
|
||||
{ 344, PT_SC, ucp_Latin },
|
||||
{ 350, PT_SC, ucp_Lepcha },
|
||||
{ 357, PT_SC, ucp_Limbu },
|
||||
{ 363, PT_SC, ucp_Linear_B },
|
||||
{ 372, PT_PC, ucp_Ll },
|
||||
{ 375, PT_PC, ucp_Lm },
|
||||
{ 378, PT_PC, ucp_Lo },
|
||||
{ 381, PT_PC, ucp_Lt },
|
||||
{ 384, PT_PC, ucp_Lu },
|
||||
{ 387, PT_SC, ucp_Lycian },
|
||||
{ 394, PT_SC, ucp_Lydian },
|
||||
{ 401, PT_GC, ucp_M },
|
||||
{ 403, PT_SC, ucp_Malayalam },
|
||||
{ 413, PT_PC, ucp_Mc },
|
||||
{ 416, PT_PC, ucp_Me },
|
||||
{ 419, PT_PC, ucp_Mn },
|
||||
{ 422, PT_SC, ucp_Mongolian },
|
||||
{ 432, PT_SC, ucp_Myanmar },
|
||||
{ 440, PT_GC, ucp_N },
|
||||
{ 442, PT_PC, ucp_Nd },
|
||||
{ 445, PT_SC, ucp_New_Tai_Lue },
|
||||
{ 457, PT_SC, ucp_Nko },
|
||||
{ 461, PT_PC, ucp_Nl },
|
||||
{ 464, PT_PC, ucp_No },
|
||||
{ 467, PT_SC, ucp_Ogham },
|
||||
{ 473, PT_SC, ucp_Ol_Chiki },
|
||||
{ 482, PT_SC, ucp_Old_Italic },
|
||||
{ 493, PT_SC, ucp_Old_Persian },
|
||||
{ 505, PT_SC, ucp_Oriya },
|
||||
{ 511, PT_SC, ucp_Osmanya },
|
||||
{ 519, PT_GC, ucp_P },
|
||||
{ 521, PT_PC, ucp_Pc },
|
||||
{ 524, PT_PC, ucp_Pd },
|
||||
{ 527, PT_PC, ucp_Pe },
|
||||
{ 530, PT_PC, ucp_Pf },
|
||||
{ 533, PT_SC, ucp_Phags_Pa },
|
||||
{ 542, PT_SC, ucp_Phoenician },
|
||||
{ 553, PT_PC, ucp_Pi },
|
||||
{ 556, PT_PC, ucp_Po },
|
||||
{ 559, PT_PC, ucp_Ps },
|
||||
{ 562, PT_SC, ucp_Rejang },
|
||||
{ 569, PT_SC, ucp_Runic },
|
||||
{ 575, PT_GC, ucp_S },
|
||||
{ 577, PT_SC, ucp_Saurashtra },
|
||||
{ 588, PT_PC, ucp_Sc },
|
||||
{ 591, PT_SC, ucp_Shavian },
|
||||
{ 599, PT_SC, ucp_Sinhala },
|
||||
{ 607, PT_PC, ucp_Sk },
|
||||
{ 610, PT_PC, ucp_Sm },
|
||||
{ 613, PT_PC, ucp_So },
|
||||
{ 616, PT_SC, ucp_Sundanese },
|
||||
{ 626, PT_SC, ucp_Syloti_Nagri },
|
||||
{ 639, PT_SC, ucp_Syriac },
|
||||
{ 646, PT_SC, ucp_Tagalog },
|
||||
{ 654, PT_SC, ucp_Tagbanwa },
|
||||
{ 663, PT_SC, ucp_Tai_Le },
|
||||
{ 670, PT_SC, ucp_Tamil },
|
||||
{ 676, PT_SC, ucp_Telugu },
|
||||
{ 683, PT_SC, ucp_Thaana },
|
||||
{ 690, PT_SC, ucp_Thai },
|
||||
{ 695, PT_SC, ucp_Tibetan },
|
||||
{ 703, PT_SC, ucp_Tifinagh },
|
||||
{ 712, PT_SC, ucp_Ugaritic },
|
||||
{ 721, PT_SC, ucp_Vai },
|
||||
{ 725, PT_SC, ucp_Yi },
|
||||
{ 728, PT_GC, ucp_Z },
|
||||
{ 730, PT_PC, ucp_Zl },
|
||||
{ 733, PT_PC, ucp_Zp },
|
||||
{ 736, PT_PC, ucp_Zs }
|
||||
{ 20, PT_SC, ucp_Avestan },
|
||||
{ 28, PT_SC, ucp_Balinese },
|
||||
{ 37, PT_SC, ucp_Bamum },
|
||||
{ 43, PT_SC, ucp_Bengali },
|
||||
{ 51, PT_SC, ucp_Bopomofo },
|
||||
{ 60, PT_SC, ucp_Braille },
|
||||
{ 68, PT_SC, ucp_Buginese },
|
||||
{ 77, PT_SC, ucp_Buhid },
|
||||
{ 83, PT_GC, ucp_C },
|
||||
{ 85, PT_SC, ucp_Canadian_Aboriginal },
|
||||
{ 105, PT_SC, ucp_Carian },
|
||||
{ 112, PT_PC, ucp_Cc },
|
||||
{ 115, PT_PC, ucp_Cf },
|
||||
{ 118, PT_SC, ucp_Cham },
|
||||
{ 123, PT_SC, ucp_Cherokee },
|
||||
{ 132, PT_PC, ucp_Cn },
|
||||
{ 135, PT_PC, ucp_Co },
|
||||
{ 138, PT_SC, ucp_Common },
|
||||
{ 145, PT_SC, ucp_Coptic },
|
||||
{ 152, PT_PC, ucp_Cs },
|
||||
{ 155, PT_SC, ucp_Cuneiform },
|
||||
{ 165, PT_SC, ucp_Cypriot },
|
||||
{ 173, PT_SC, ucp_Cyrillic },
|
||||
{ 182, PT_SC, ucp_Deseret },
|
||||
{ 190, PT_SC, ucp_Devanagari },
|
||||
{ 201, PT_SC, ucp_Egyptian_Hieroglyphs },
|
||||
{ 222, PT_SC, ucp_Ethiopic },
|
||||
{ 231, PT_SC, ucp_Georgian },
|
||||
{ 240, PT_SC, ucp_Glagolitic },
|
||||
{ 251, PT_SC, ucp_Gothic },
|
||||
{ 258, PT_SC, ucp_Greek },
|
||||
{ 264, PT_SC, ucp_Gujarati },
|
||||
{ 273, PT_SC, ucp_Gurmukhi },
|
||||
{ 282, PT_SC, ucp_Han },
|
||||
{ 286, PT_SC, ucp_Hangul },
|
||||
{ 293, PT_SC, ucp_Hanunoo },
|
||||
{ 301, PT_SC, ucp_Hebrew },
|
||||
{ 308, PT_SC, ucp_Hiragana },
|
||||
{ 317, PT_SC, ucp_Imperial_Aramaic },
|
||||
{ 334, PT_SC, ucp_Inherited },
|
||||
{ 344, PT_SC, ucp_Inscriptional_Pahlavi },
|
||||
{ 366, PT_SC, ucp_Inscriptional_Parthian },
|
||||
{ 389, PT_SC, ucp_Javanese },
|
||||
{ 398, PT_SC, ucp_Kaithi },
|
||||
{ 405, PT_SC, ucp_Kannada },
|
||||
{ 413, PT_SC, ucp_Katakana },
|
||||
{ 422, PT_SC, ucp_Kayah_Li },
|
||||
{ 431, PT_SC, ucp_Kharoshthi },
|
||||
{ 442, PT_SC, ucp_Khmer },
|
||||
{ 448, PT_GC, ucp_L },
|
||||
{ 450, PT_LAMP, 0 },
|
||||
{ 453, PT_SC, ucp_Lao },
|
||||
{ 457, PT_SC, ucp_Latin },
|
||||
{ 463, PT_SC, ucp_Lepcha },
|
||||
{ 470, PT_SC, ucp_Limbu },
|
||||
{ 476, PT_SC, ucp_Linear_B },
|
||||
{ 485, PT_SC, ucp_Lisu },
|
||||
{ 490, PT_PC, ucp_Ll },
|
||||
{ 493, PT_PC, ucp_Lm },
|
||||
{ 496, PT_PC, ucp_Lo },
|
||||
{ 499, PT_PC, ucp_Lt },
|
||||
{ 502, PT_PC, ucp_Lu },
|
||||
{ 505, PT_SC, ucp_Lycian },
|
||||
{ 512, PT_SC, ucp_Lydian },
|
||||
{ 519, PT_GC, ucp_M },
|
||||
{ 521, PT_SC, ucp_Malayalam },
|
||||
{ 531, PT_PC, ucp_Mc },
|
||||
{ 534, PT_PC, ucp_Me },
|
||||
{ 537, PT_SC, ucp_Meetei_Mayek },
|
||||
{ 550, PT_PC, ucp_Mn },
|
||||
{ 553, PT_SC, ucp_Mongolian },
|
||||
{ 563, PT_SC, ucp_Myanmar },
|
||||
{ 571, PT_GC, ucp_N },
|
||||
{ 573, PT_PC, ucp_Nd },
|
||||
{ 576, PT_SC, ucp_New_Tai_Lue },
|
||||
{ 588, PT_SC, ucp_Nko },
|
||||
{ 592, PT_PC, ucp_Nl },
|
||||
{ 595, PT_PC, ucp_No },
|
||||
{ 598, PT_SC, ucp_Ogham },
|
||||
{ 604, PT_SC, ucp_Ol_Chiki },
|
||||
{ 613, PT_SC, ucp_Old_Italic },
|
||||
{ 624, PT_SC, ucp_Old_Persian },
|
||||
{ 636, PT_SC, ucp_Old_South_Arabian },
|
||||
{ 654, PT_SC, ucp_Old_Turkic },
|
||||
{ 665, PT_SC, ucp_Oriya },
|
||||
{ 671, PT_SC, ucp_Osmanya },
|
||||
{ 679, PT_GC, ucp_P },
|
||||
{ 681, PT_PC, ucp_Pc },
|
||||
{ 684, PT_PC, ucp_Pd },
|
||||
{ 687, PT_PC, ucp_Pe },
|
||||
{ 690, PT_PC, ucp_Pf },
|
||||
{ 693, PT_SC, ucp_Phags_Pa },
|
||||
{ 702, PT_SC, ucp_Phoenician },
|
||||
{ 713, PT_PC, ucp_Pi },
|
||||
{ 716, PT_PC, ucp_Po },
|
||||
{ 719, PT_PC, ucp_Ps },
|
||||
{ 722, PT_SC, ucp_Rejang },
|
||||
{ 729, PT_SC, ucp_Runic },
|
||||
{ 735, PT_GC, ucp_S },
|
||||
{ 737, PT_SC, ucp_Samaritan },
|
||||
{ 747, PT_SC, ucp_Saurashtra },
|
||||
{ 758, PT_PC, ucp_Sc },
|
||||
{ 761, PT_SC, ucp_Shavian },
|
||||
{ 769, PT_SC, ucp_Sinhala },
|
||||
{ 777, PT_PC, ucp_Sk },
|
||||
{ 780, PT_PC, ucp_Sm },
|
||||
{ 783, PT_PC, ucp_So },
|
||||
{ 786, PT_SC, ucp_Sundanese },
|
||||
{ 796, PT_SC, ucp_Syloti_Nagri },
|
||||
{ 809, PT_SC, ucp_Syriac },
|
||||
{ 816, PT_SC, ucp_Tagalog },
|
||||
{ 824, PT_SC, ucp_Tagbanwa },
|
||||
{ 833, PT_SC, ucp_Tai_Le },
|
||||
{ 840, PT_SC, ucp_Tai_Tham },
|
||||
{ 849, PT_SC, ucp_Tai_Viet },
|
||||
{ 858, PT_SC, ucp_Tamil },
|
||||
{ 864, PT_SC, ucp_Telugu },
|
||||
{ 871, PT_SC, ucp_Thaana },
|
||||
{ 878, PT_SC, ucp_Thai },
|
||||
{ 883, PT_SC, ucp_Tibetan },
|
||||
{ 891, PT_SC, ucp_Tifinagh },
|
||||
{ 900, PT_SC, ucp_Ugaritic },
|
||||
{ 909, PT_SC, ucp_Vai },
|
||||
{ 913, PT_SC, ucp_Yi },
|
||||
{ 916, PT_GC, ucp_Z },
|
||||
{ 918, PT_PC, ucp_Zl },
|
||||
{ 921, PT_PC, ucp_Zp },
|
||||
{ 924, PT_PC, ucp_Zs }
|
||||
};
|
||||
|
||||
const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table);
|
||||
|
@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
Copyright (c) 1997-2009 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -128,7 +128,9 @@ if (study != NULL)
|
||||
{
|
||||
*internal_study = *study; /* To copy other fields */
|
||||
internal_study->size = byteflip(study->size, sizeof(study->size));
|
||||
internal_study->options = byteflip(study->options, sizeof(study->options));
|
||||
internal_study->flags = byteflip(study->flags, sizeof(study->flags));
|
||||
internal_study->minlength = byteflip(study->minlength,
|
||||
sizeof(study->minlength));
|
||||
}
|
||||
|
||||
return internal_re;
|
||||
|
@ -52,6 +52,50 @@ properties. */
|
||||
#include "ucp.h" /* Category definitions */
|
||||
|
||||
|
||||
/* Table to translate from particular type value to the general value. */
|
||||
|
||||
static int ucp_gentype[] = {
|
||||
ucp_C, ucp_C, ucp_C, ucp_C, ucp_C, /* Cc, Cf, Cn, Co, Cs */
|
||||
ucp_L, ucp_L, ucp_L, ucp_L, ucp_L, /* Ll, Lu, Lm, Lo, Lt */
|
||||
ucp_M, ucp_M, ucp_M, /* Mc, Me, Mn */
|
||||
ucp_N, ucp_N, ucp_N, /* Nd, Nl, No */
|
||||
ucp_P, ucp_P, ucp_P, ucp_P, ucp_P, /* Pc, Pd, Pe, Pf, Pi */
|
||||
ucp_P, ucp_P, /* Ps, Po */
|
||||
ucp_S, ucp_S, ucp_S, ucp_S, /* Sc, Sk, Sm, So */
|
||||
ucp_Z, ucp_Z, ucp_Z /* Zl, Zp, Zs */
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Search table and return type *
|
||||
*************************************************/
|
||||
|
||||
/* Three values are returned: the category is ucp_C, ucp_L, etc. The detailed
|
||||
character type is ucp_Lu, ucp_Nd, etc. The script is ucp_Latin, etc.
|
||||
|
||||
Arguments:
|
||||
c the character value
|
||||
type_ptr the detailed character type is returned here
|
||||
script_ptr the script is returned here
|
||||
|
||||
Returns: the character type category
|
||||
*/
|
||||
|
||||
int
|
||||
_pcre_ucp_findprop(const unsigned int c, int *type_ptr, int *script_ptr)
|
||||
{
|
||||
/* Note that the Unicode types have the same values in glib and in
|
||||
* PCRE, so ucp_Ll == G_UNICODE_LOWERCASE_LETTER,
|
||||
* ucp_Zs == G_UNICODE_SPACE_SEPARATOR, and so on. */
|
||||
*type_ptr = g_unichar_type(c);
|
||||
*script_ptr = g_unichar_get_script(c);
|
||||
return ucp_gentype[*type_ptr];
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/*************************************************
|
||||
* Search table and return other case *
|
||||
*************************************************/
|
||||
@ -68,7 +112,7 @@ Returns: the other case or NOTACHAR if none
|
||||
unsigned int
|
||||
_pcre_ucp_othercase(const unsigned int c)
|
||||
{
|
||||
unsigned int other_case = NOTACHAR;
|
||||
int other_case = NOTACHAR;
|
||||
|
||||
if (g_unichar_islower(c))
|
||||
other_case = g_unichar_toupper(c);
|
||||
|
@ -6,7 +6,7 @@
|
||||
and semantics are as close as possible to those of the Perl 5 language.
|
||||
|
||||
Written by Philip Hazel
|
||||
Copyright (c) 1997-2008 University of Cambridge
|
||||
Copyright (c) 1997-2009 University of Cambridge
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@ -39,8 +39,7 @@ POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
/* This module contains an internal function that is used to match an extended
|
||||
class (one that contains characters whose values are > 255). It is used by both
|
||||
pcre_exec() and pcre_dfa_exec(). */
|
||||
class. It is used by both pcre_exec() and pcre_dfa_exec(). */
|
||||
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
@ -55,7 +54,7 @@ pcre_exec() and pcre_def_exec(). */
|
||||
*************************************************/
|
||||
|
||||
/* This function is called to match a character against an extended class that
|
||||
might contain values > 255.
|
||||
might contain values > 255 and/or Unicode properties.
|
||||
|
||||
Arguments:
|
||||
c the character
|
||||
|
@ -6,9 +6,8 @@
|
||||
#define _UCP_H
|
||||
|
||||
/* This file contains definitions of the property values that are returned by
|
||||
the function _pcre_ucp_findprop(). New values that are added for new releases
|
||||
of Unicode should always be at the end of each enum, for backwards
|
||||
compatibility. */
|
||||
the UCD access macros. New values that are added for new releases of Unicode
|
||||
should always be at the end of each enum, for backwards compatibility. */
|
||||
|
||||
/* These are the general character categories. */
|
||||
|
||||
@ -121,24 +120,40 @@ enum {
|
||||
ucp_Tifinagh = G_UNICODE_SCRIPT_TIFINAGH,
|
||||
ucp_Ugaritic = G_UNICODE_SCRIPT_UGARITIC,
|
||||
ucp_Yi = G_UNICODE_SCRIPT_YI,
|
||||
ucp_Balinese = G_UNICODE_SCRIPT_BALINESE, /* New for Unicode 5.0.0 */
|
||||
ucp_Cuneiform = G_UNICODE_SCRIPT_CUNEIFORM, /* New for Unicode 5.0.0 */
|
||||
ucp_Nko = G_UNICODE_SCRIPT_NKO, /* New for Unicode 5.0.0 */
|
||||
ucp_Phags_Pa = G_UNICODE_SCRIPT_PHAGS_PA, /* New for Unicode 5.0.0 */
|
||||
ucp_Phoenician = G_UNICODE_SCRIPT_PHOENICIAN, /* New for Unicode 5.0.0 */
|
||||
ucp_Carian = G_UNICODE_SCRIPT_CARIAN, /* New for Unicode 5.1 */
|
||||
ucp_Cham = G_UNICODE_SCRIPT_CHAM, /* New for Unicode 5.1 */
|
||||
ucp_Kayah_Li = G_UNICODE_SCRIPT_KAYAH_LI, /* New for Unicode 5.1 */
|
||||
ucp_Lepcha = G_UNICODE_SCRIPT_LEPCHA, /* New for Unicode 5.1 */
|
||||
ucp_Lycian = G_UNICODE_SCRIPT_LYCIAN, /* New for Unicode 5.1 */
|
||||
ucp_Lydian = G_UNICODE_SCRIPT_LYDIAN, /* New for Unicode 5.1 */
|
||||
ucp_Ol_Chiki = G_UNICODE_SCRIPT_OL_CHIKI, /* New for Unicode 5.1 */
|
||||
ucp_Rejang = G_UNICODE_SCRIPT_REJANG, /* New for Unicode 5.1 */
|
||||
ucp_Saurashtra = G_UNICODE_SCRIPT_SAURASHTRA, /* New for Unicode 5.1 */
|
||||
ucp_Sundanese = G_UNICODE_SCRIPT_SUNDANESE, /* New for Unicode 5.1 */
|
||||
ucp_Vai = G_UNICODE_SCRIPT_VAI /* New for Unicode 5.1 */
|
||||
ucp_Balinese = G_UNICODE_SCRIPT_BALINESE,
|
||||
ucp_Cuneiform = G_UNICODE_SCRIPT_CUNEIFORM,
|
||||
ucp_Nko = G_UNICODE_SCRIPT_NKO,
|
||||
ucp_Phags_Pa = G_UNICODE_SCRIPT_PHAGS_PA,
|
||||
ucp_Phoenician = G_UNICODE_SCRIPT_PHOENICIAN,
|
||||
ucp_Carian = G_UNICODE_SCRIPT_CARIAN,
|
||||
ucp_Cham = G_UNICODE_SCRIPT_CHAM,
|
||||
ucp_Kayah_Li = G_UNICODE_SCRIPT_KAYAH_LI,
|
||||
ucp_Lepcha = G_UNICODE_SCRIPT_LEPCHA,
|
||||
ucp_Lycian = G_UNICODE_SCRIPT_LYCIAN,
|
||||
ucp_Lydian = G_UNICODE_SCRIPT_LYDIAN,
|
||||
ucp_Ol_Chiki = G_UNICODE_SCRIPT_OL_CHIKI,
|
||||
ucp_Rejang = G_UNICODE_SCRIPT_REJANG,
|
||||
ucp_Saurashtra = G_UNICODE_SCRIPT_SAURASHTRA,
|
||||
ucp_Sundanese = G_UNICODE_SCRIPT_SUNDANESE,
|
||||
ucp_Vai = G_UNICODE_SCRIPT_VAI,
|
||||
ucp_Avestan = G_UNICODE_SCRIPT_AVESTAN,
|
||||
ucp_Bamum = G_UNICODE_SCRIPT_BAMUM,
|
||||
ucp_Egyptian_Hieroglyphs = G_UNICODE_SCRIPT_EGYPTIAN_HIEROGLYPHS,
|
||||
ucp_Imperial_Aramaic = G_UNICODE_SCRIPT_IMPERIAL_ARAMAIC,
|
||||
ucp_Inscriptional_Pahlavi = G_UNICODE_SCRIPT_INSCRIPTIONAL_PAHLAVI,
|
||||
ucp_Inscriptional_Parthian = G_UNICODE_SCRIPT_INSCRIPTIONAL_PARTHIAN,
|
||||
ucp_Javanese = G_UNICODE_SCRIPT_JAVANESE,
|
||||
ucp_Kaithi = G_UNICODE_SCRIPT_KAITHI,
|
||||
ucp_Lisu = G_UNICODE_SCRIPT_LISU,
|
||||
ucp_Meetei_Mayek = G_UNICODE_SCRIPT_MEETEI_MAYEK,
|
||||
ucp_Old_South_Arabian = G_UNICODE_SCRIPT_OLD_SOUTH_ARABIAN,
|
||||
ucp_Old_Turkic = G_UNICODE_SCRIPT_OLD_TURKISH,
|
||||
ucp_Samaritan = G_UNICODE_SCRIPT_SAMARITAN,
|
||||
ucp_Tai_Tham = G_UNICODE_SCRIPT_TAI_THAM,
|
||||
ucp_Tai_Viet = G_UNICODE_SCRIPT_TAI_VIET
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
/* End of ucp.h */
|
||||
|
||||
|
@ -1,92 +0,0 @@
|
||||
/*************************************************
|
||||
* Unicode Property Table handler *
|
||||
*************************************************/
|
||||
|
||||
#ifndef _UCPINTERNAL_H
|
||||
#define _UCPINTERNAL_H
|
||||
|
||||
/* Internal header file defining the layout of the bits in each pair of 32-bit
|
||||
words that form a data item in the table. */
|
||||
|
||||
typedef struct cnode {
|
||||
pcre_uint32 f0;
|
||||
pcre_uint32 f1;
|
||||
} cnode;
|
||||
|
||||
/* Things for the f0 field */
|
||||
|
||||
#define f0_scriptmask 0xff000000 /* Mask for script field */
|
||||
#define f0_scriptshift 24 /* Shift for script value */
|
||||
#define f0_rangeflag 0x00800000 /* Flag for a range item */
|
||||
#define f0_charmask 0x001fffff /* Mask for code point value */
|
||||
|
||||
/* Things for the f1 field */
|
||||
|
||||
#define f1_typemask 0xfc000000 /* Mask for char type field */
|
||||
#define f1_typeshift 26 /* Shift for the type field */
|
||||
#define f1_rangemask 0x0000ffff /* Mask for a range offset */
|
||||
#define f1_casemask 0x0000ffff /* Mask for a case offset */
|
||||
#define f1_caseneg 0xffff8000 /* Bits for negation */
|
||||
|
||||
/* The data consists of a vector of structures of type cnode. The two unsigned
|
||||
32-bit integers are used as follows:
|
||||
|
||||
(f0) (1) The most significant byte holds the script number. The numbers are
|
||||
defined by the enum in ucp.h.
|
||||
|
||||
(2) The 0x00800000 bit is set if this entry defines a range of characters.
|
||||
It is not set if this entry defines a single character.
|
||||
|
||||
(3) The 0x00600000 bits are spare.
|
||||
|
||||
(4) The 0x001fffff bits contain the code point. No Unicode code point will
|
||||
ever be greater than 0x0010ffff, so this should be OK for ever.
|
||||
|
||||
(f1) (1) The 0xfc000000 bits contain the character type number. The numbers are
|
||||
defined by an enum in ucp.h.
|
||||
|
||||
(2) The 0x03ff0000 bits are spare.
|
||||
|
||||
(3) The 0x0000ffff bits contain EITHER the unsigned offset to the top of
|
||||
range if this entry defines a range, OR the *signed* offset to the
|
||||
character's "other case" partner if this entry defines a single
|
||||
character. There is no partner if the value is zero.
|
||||
|
||||
-------------------------------------------------------------------------------
|
||||
| script (8) |.|.|.| codepoint (21) || type (6) |.|.| spare (8) | offset (16) |
|
||||
-------------------------------------------------------------------------------
|
||||
| | | | |
|
||||
| | |-> spare | |-> spare
|
||||
| | |
|
||||
| |-> spare |-> spare
|
||||
|
|
||||
|-> range flag
|
||||
|
||||
The upper/lower casing information is set only for characters that come in
|
||||
pairs. The non-one-to-one mappings in the Unicode data are ignored.
|
||||
|
||||
When searching the data, proceed as follows:
|
||||
|
||||
(1) Set up for a binary chop search.
|
||||
|
||||
(2) If the top is not greater than the bottom, the character is not in the
|
||||
table. Its type must therefore be "Cn" ("Undefined").
|
||||
|
||||
(3) Find the middle vector element.
|
||||
|
||||
(4) Extract the code point and compare. If equal, we are done.
|
||||
|
||||
(5) If the test character is smaller, set the top to the current point, and
|
||||
goto (2).
|
||||
|
||||
(6) If the current entry defines a range, compute the last character by adding
|
||||
the offset, and see if the test character is within the range. If it is,
|
||||
we are done.
|
||||
|
||||
(7) Otherwise, set the bottom to one element past the current point and goto
|
||||
(2).
|
||||
*/
|
||||
|
||||
#endif /* _UCPINTERNAL_H */
|
||||
|
||||
/* End of ucpinternal.h */
|
Loading…
Reference in New Issue
Block a user