From a6e3eb6eced6b4f1d816a0ea73adb5081145730a Mon Sep 17 00:00:00 2001 From: Christian Persch Date: Fri, 8 Jun 2012 00:56:44 +0200 Subject: [PATCH] regex: Add NO_START_OPTIMIZE compile and match flags PCRE_NO_START_OPTIMIZE has existed since PCRE 7.9, but was not useful before since it only affects callouts (which GRegex doesn't support) and backtracking control verbs which the last commit makes use of. --- glib/gregex.c | 67 +++++++++++++++++++++++++--------------------- glib/gregex.h | 39 +++++++++++++++------------ glib/tests/regex.c | 2 +- 3 files changed, 60 insertions(+), 48 deletions(-) diff --git a/glib/gregex.c b/glib/gregex.c index e69b2c0f5..282700a53 100644 --- a/glib/gregex.c +++ b/glib/gregex.c @@ -128,7 +128,8 @@ G_REGEX_NEWLINE_CRLF | \ G_REGEX_NEWLINE_ANYCRLF | \ G_REGEX_BSR_ANYCRLF | \ - G_REGEX_JAVASCRIPT_COMPAT) + G_REGEX_JAVASCRIPT_COMPAT | \ + G_REGEX_NO_START_OPTIMIZE) /* Mask of all GRegexCompileFlags values that are (not) passed trough to PCRE */ #define G_REGEX_COMPILE_PCRE_MASK (G_REGEX_COMPILE_MASK & ~G_REGEX_COMPILE_NONPCRE_MASK) @@ -136,20 +137,21 @@ G_REGEX_OPTIMIZE) /* Mask of all the possible values for GRegexMatchFlags. 
*/ -#define G_REGEX_MATCH_MASK (G_REGEX_MATCH_ANCHORED | \ - G_REGEX_MATCH_NOTBOL | \ - G_REGEX_MATCH_NOTEOL | \ - G_REGEX_MATCH_NOTEMPTY | \ - G_REGEX_MATCH_PARTIAL | \ - G_REGEX_MATCH_NEWLINE_CR | \ - G_REGEX_MATCH_NEWLINE_LF | \ - G_REGEX_MATCH_NEWLINE_CRLF | \ - G_REGEX_MATCH_NEWLINE_ANY | \ - G_REGEX_MATCH_NEWLINE_ANYCRLF | \ - G_REGEX_MATCH_BSR_ANYCRLF | \ - G_REGEX_MATCH_BSR_ANY | \ - G_REGEX_MATCH_PARTIAL_SOFT | \ - G_REGEX_MATCH_PARTIAL_HARD | \ +#define G_REGEX_MATCH_MASK (G_REGEX_MATCH_ANCHORED | \ + G_REGEX_MATCH_NOTBOL | \ + G_REGEX_MATCH_NOTEOL | \ + G_REGEX_MATCH_NOTEMPTY | \ + G_REGEX_MATCH_PARTIAL | \ + G_REGEX_MATCH_NEWLINE_CR | \ + G_REGEX_MATCH_NEWLINE_LF | \ + G_REGEX_MATCH_NEWLINE_CRLF | \ + G_REGEX_MATCH_NEWLINE_ANY | \ + G_REGEX_MATCH_NEWLINE_ANYCRLF | \ + G_REGEX_MATCH_BSR_ANYCRLF | \ + G_REGEX_MATCH_BSR_ANY | \ + G_REGEX_MATCH_NO_START_OPTIMIZE | \ + G_REGEX_MATCH_PARTIAL_SOFT | \ + G_REGEX_MATCH_PARTIAL_HARD | \ G_REGEX_MATCH_NOTEMPTY_ATSTART) /* we rely on these flags having the same values */ @@ -169,22 +171,24 @@ G_STATIC_ASSERT (G_REGEX_NEWLINE_CRLF == PCRE_NEWLINE_CRLF); G_STATIC_ASSERT (G_REGEX_NEWLINE_ANYCRLF == PCRE_NEWLINE_ANYCRLF); G_STATIC_ASSERT (G_REGEX_BSR_ANYCRLF == PCRE_BSR_ANYCRLF); G_STATIC_ASSERT (G_REGEX_JAVASCRIPT_COMPAT == PCRE_JAVASCRIPT_COMPAT); +G_STATIC_ASSERT (G_REGEX_NO_START_OPTIMIZE == PCRE_NO_START_OPTIMIZE); -G_STATIC_ASSERT (G_REGEX_MATCH_ANCHORED == PCRE_ANCHORED); -G_STATIC_ASSERT (G_REGEX_MATCH_NOTBOL == PCRE_NOTBOL); -G_STATIC_ASSERT (G_REGEX_MATCH_NOTEOL == PCRE_NOTEOL); -G_STATIC_ASSERT (G_REGEX_MATCH_NOTEMPTY == PCRE_NOTEMPTY); -G_STATIC_ASSERT (G_REGEX_MATCH_PARTIAL == PCRE_PARTIAL); -G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_CR == PCRE_NEWLINE_CR); -G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_LF == PCRE_NEWLINE_LF); -G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_CRLF == PCRE_NEWLINE_CRLF); -G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_ANY == PCRE_NEWLINE_ANY); -G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_ANYCRLF == 
PCRE_NEWLINE_ANYCRLF); -G_STATIC_ASSERT (G_REGEX_MATCH_BSR_ANYCRLF == PCRE_BSR_ANYCRLF); -G_STATIC_ASSERT (G_REGEX_MATCH_BSR_ANY == PCRE_BSR_UNICODE); -G_STATIC_ASSERT (G_REGEX_MATCH_PARTIAL_SOFT == PCRE_PARTIAL_SOFT); -G_STATIC_ASSERT (G_REGEX_MATCH_PARTIAL_HARD == PCRE_PARTIAL_HARD); -G_STATIC_ASSERT (G_REGEX_MATCH_NOTEMPTY_ATSTART == PCRE_NOTEMPTY_ATSTART); +G_STATIC_ASSERT (G_REGEX_MATCH_ANCHORED == PCRE_ANCHORED); +G_STATIC_ASSERT (G_REGEX_MATCH_NOTBOL == PCRE_NOTBOL); +G_STATIC_ASSERT (G_REGEX_MATCH_NOTEOL == PCRE_NOTEOL); +G_STATIC_ASSERT (G_REGEX_MATCH_NOTEMPTY == PCRE_NOTEMPTY); +G_STATIC_ASSERT (G_REGEX_MATCH_PARTIAL == PCRE_PARTIAL); +G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_CR == PCRE_NEWLINE_CR); +G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_LF == PCRE_NEWLINE_LF); +G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_CRLF == PCRE_NEWLINE_CRLF); +G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_ANY == PCRE_NEWLINE_ANY); +G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_ANYCRLF == PCRE_NEWLINE_ANYCRLF); +G_STATIC_ASSERT (G_REGEX_MATCH_BSR_ANYCRLF == PCRE_BSR_ANYCRLF); +G_STATIC_ASSERT (G_REGEX_MATCH_BSR_ANY == PCRE_BSR_UNICODE); +G_STATIC_ASSERT (G_REGEX_MATCH_NO_START_OPTIMIZE == PCRE_NO_START_OPTIMIZE); +G_STATIC_ASSERT (G_REGEX_MATCH_PARTIAL_SOFT == PCRE_PARTIAL_SOFT); +G_STATIC_ASSERT (G_REGEX_MATCH_PARTIAL_HARD == PCRE_PARTIAL_HARD); +G_STATIC_ASSERT (G_REGEX_MATCH_NOTEMPTY_ATSTART == PCRE_NOTEMPTY_ATSTART); /* These PCRE flags are unused or not exposed publically in GRegexFlags, so * it should be ok to reuse them for different things. @@ -659,6 +663,9 @@ g_match_info_get_string (const GMatchInfo *match_info) * the argument of the last verb encountered in the whole matching * process. Otherwise, $NULL is returned. * + * See man:pcrepattern for more information on + * backtracking control verbs. 
+ * * Returns: (transfer none): the mark, or %NULL * * Since: 2.34 diff --git a/glib/gregex.h b/glib/gregex.h index 694440603..0640aac4d 100644 --- a/glib/gregex.h +++ b/glib/gregex.h @@ -279,7 +279,8 @@ GQuark g_regex_error_quark (void); * G_REGEX_BSR_ANYCRLF: Usually any newline character or character sequence * is recognised. If this option is set, then "\R" only recognizes the newline * characters '\r', '\n' and '\r\n'. Since: 2.34 - * + * @G_REGEX_NO_START_OPTIMIZE: Disable some optimizations that will cause incorrect + * results for g_match_info_get_mark() when using backtracking control verbs. Since: 2.34 * * Flags specifying compile-time options. * @@ -306,7 +307,8 @@ typedef enum G_REGEX_NEWLINE_CRLF = G_REGEX_NEWLINE_CR | G_REGEX_NEWLINE_LF, G_REGEX_NEWLINE_ANYCRLF = G_REGEX_NEWLINE_CR | 1 << 22, G_REGEX_BSR_ANYCRLF = 1 << 23, - G_REGEX_JAVASCRIPT_COMPAT = 1 << 25 + G_REGEX_JAVASCRIPT_COMPAT = 1 << 25, + G_REGEX_NO_START_OPTIMIZE = 1 << 26 } GRegexCompileFlags; /** @@ -372,6 +374,8 @@ typedef enum * @G_REGEX_MATCH_NOTEMPTY_ATSTART: Like #G_REGEX_MATCH_NOTEMPTY, but only applied to * the start of the matched string. For anchored * patterns this can only happen for pattern containing "\K". Since: 2.34 + * @G_REGEX_MATCH_NO_START_OPTIMIZE: Disable some optimizations that will cause incorrect + * results for g_match_info_get_mark() when using backtracking control verbs. Since: 2.34 * * Flags specifying match-time options. * @@ -381,21 +385,22 @@ typedef enum * adding a new flag. 
*/ typedef enum { - G_REGEX_MATCH_ANCHORED = 1 << 4, - G_REGEX_MATCH_NOTBOL = 1 << 7, - G_REGEX_MATCH_NOTEOL = 1 << 8, - G_REGEX_MATCH_NOTEMPTY = 1 << 10, - G_REGEX_MATCH_PARTIAL = 1 << 15, - G_REGEX_MATCH_NEWLINE_CR = 1 << 20, - G_REGEX_MATCH_NEWLINE_LF = 1 << 21, - G_REGEX_MATCH_NEWLINE_CRLF = G_REGEX_MATCH_NEWLINE_CR | G_REGEX_MATCH_NEWLINE_LF, - G_REGEX_MATCH_NEWLINE_ANY = 1 << 22, - G_REGEX_MATCH_NEWLINE_ANYCRLF = G_REGEX_MATCH_NEWLINE_CR | G_REGEX_MATCH_NEWLINE_ANY, - G_REGEX_MATCH_BSR_ANYCRLF = 1 << 23, - G_REGEX_MATCH_BSR_ANY = 1 << 24, - G_REGEX_MATCH_PARTIAL_SOFT = G_REGEX_MATCH_PARTIAL, - G_REGEX_MATCH_PARTIAL_HARD = 1 << 27, - G_REGEX_MATCH_NOTEMPTY_ATSTART = 1 << 28 + G_REGEX_MATCH_ANCHORED = 1 << 4, + G_REGEX_MATCH_NOTBOL = 1 << 7, + G_REGEX_MATCH_NOTEOL = 1 << 8, + G_REGEX_MATCH_NOTEMPTY = 1 << 10, + G_REGEX_MATCH_PARTIAL = 1 << 15, + G_REGEX_MATCH_NEWLINE_CR = 1 << 20, + G_REGEX_MATCH_NEWLINE_LF = 1 << 21, + G_REGEX_MATCH_NEWLINE_CRLF = G_REGEX_MATCH_NEWLINE_CR | G_REGEX_MATCH_NEWLINE_LF, + G_REGEX_MATCH_NEWLINE_ANY = 1 << 22, + G_REGEX_MATCH_NEWLINE_ANYCRLF = G_REGEX_MATCH_NEWLINE_CR | G_REGEX_MATCH_NEWLINE_ANY, + G_REGEX_MATCH_BSR_ANYCRLF = 1 << 23, + G_REGEX_MATCH_BSR_ANY = 1 << 24, + G_REGEX_MATCH_NO_START_OPTIMIZE = 1 << 26, + G_REGEX_MATCH_PARTIAL_SOFT = G_REGEX_MATCH_PARTIAL, + G_REGEX_MATCH_PARTIAL_HARD = 1 << 27, + G_REGEX_MATCH_NOTEMPTY_ATSTART = 1 << 28 } GRegexMatchFlags; /** diff --git a/glib/tests/regex.c b/glib/tests/regex.c index ed5ab8060..54cc50f20 100644 --- a/glib/tests/regex.c +++ b/glib/tests/regex.c @@ -2146,7 +2146,7 @@ main (int argc, char *argv[]) TEST_NEW_CHECK_FLAGS ("(*ANYCRLF)a", 0, 0, G_REGEX_NEWLINE_ANYCRLF, 0); TEST_NEW_CHECK_FLAGS ("(*BSR_ANYCRLF)a", 0, 0, G_REGEX_BSR_ANYCRLF, 0); TEST_NEW_CHECK_FLAGS ("(*BSR_UNICODE)a", 0, 0, 0 /* this is the default in GRegex */, 0); - TEST_NEW_CHECK_FLAGS ("(*NO_START_OPT)a", 0, 0, 0 /* not exposed in GRegex */, 0); + TEST_NEW_CHECK_FLAGS ("(*NO_START_OPT)", 0, 0, 
G_REGEX_NO_START_OPTIMIZE, 0); /* TEST_NEW_FAIL(pattern, compile_opts, expected_error) */ TEST_NEW_FAIL("(", 0, G_REGEX_ERROR_UNMATCHED_PARENTHESIS);