mirror of
https://gitlab.gnome.org/GNOME/glib.git
synced 2025-01-13 07:56:17 +01:00
regex: if PCRE is 8.34 or later, disable auto-possessification for DFA
Normally, recent PCRE behaves as if certain patterns were replaced by a more "possessive" pattern that gives the same answer for normal regex matching, but is more efficient. However, the modified pattern produces fewer results under DFA. If we want the full set of results we have to apply PCRE_NO_AUTO_POSSESS, and that's a compile-time flag. This currently only affects a system PCRE, but would also work fine for an internal PCRE 8.34 or later if the embedded copy is updated. Bug: https://bugzilla.gnome.org/show_bug.cgi?id=733325 Reviewed-by: Christian Persch <chpe@gnome.org>
This commit is contained in:
parent
f45ceb838d
commit
bf181a3ac7
128
glib/gregex.c
128
glib/gregex.c
@ -1267,6 +1267,15 @@ g_regex_unref (GRegex *regex)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* @match_options: (inout) (optional):
|
||||
*/
|
||||
static pcre *regex_compile (const gchar *pattern,
|
||||
GRegexCompileFlags compile_options,
|
||||
GRegexCompileFlags *compile_options_out,
|
||||
GRegexMatchFlags *match_options,
|
||||
GError **error);
|
||||
|
||||
/**
|
||||
* g_regex_new:
|
||||
* @pattern: the regular expression
|
||||
@ -1291,12 +1300,8 @@ g_regex_new (const gchar *pattern,
|
||||
GRegex *regex;
|
||||
pcre *re;
|
||||
const gchar *errmsg;
|
||||
gint erroffset;
|
||||
gint errcode;
|
||||
gboolean optimize = FALSE;
|
||||
static volatile gsize initialised = 0;
|
||||
unsigned long int pcre_compile_options;
|
||||
GRegexCompileFlags nonpcre_compile_options;
|
||||
|
||||
g_return_val_if_fail (pattern != NULL, NULL);
|
||||
g_return_val_if_fail (error == NULL || *error == NULL, NULL);
|
||||
@ -1325,13 +1330,61 @@ g_regex_new (const gchar *pattern,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
nonpcre_compile_options = compile_options & G_REGEX_COMPILE_NONPCRE_MASK;
|
||||
|
||||
/* G_REGEX_OPTIMIZE has the same numeric value of PCRE_NO_UTF8_CHECK,
|
||||
* as we do not need to wrap PCRE_NO_UTF8_CHECK. */
|
||||
if (compile_options & G_REGEX_OPTIMIZE)
|
||||
optimize = TRUE;
|
||||
|
||||
re = regex_compile (pattern, compile_options, &compile_options,
|
||||
&match_options, error);
|
||||
|
||||
if (re == NULL)
|
||||
return NULL;
|
||||
|
||||
regex = g_new0 (GRegex, 1);
|
||||
regex->ref_count = 1;
|
||||
regex->pattern = g_strdup (pattern);
|
||||
regex->pcre_re = re;
|
||||
regex->compile_opts = compile_options;
|
||||
regex->match_opts = match_options;
|
||||
|
||||
if (optimize)
|
||||
{
|
||||
regex->extra = pcre_study (regex->pcre_re, 0, &errmsg);
|
||||
if (errmsg != NULL)
|
||||
{
|
||||
GError *tmp_error = g_error_new (G_REGEX_ERROR,
|
||||
G_REGEX_ERROR_OPTIMIZE,
|
||||
_("Error while optimizing "
|
||||
"regular expression %s: %s"),
|
||||
regex->pattern,
|
||||
errmsg);
|
||||
g_propagate_error (error, tmp_error);
|
||||
|
||||
g_regex_unref (regex);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return regex;
|
||||
}
|
||||
|
||||
static pcre *
|
||||
regex_compile (const gchar *pattern,
|
||||
GRegexCompileFlags compile_options,
|
||||
GRegexCompileFlags *compile_options_out,
|
||||
GRegexMatchFlags *match_options,
|
||||
GError **error)
|
||||
{
|
||||
pcre *re;
|
||||
const gchar *errmsg;
|
||||
gint erroffset;
|
||||
gint errcode;
|
||||
GRegexCompileFlags nonpcre_compile_options;
|
||||
unsigned long int pcre_compile_options;
|
||||
|
||||
nonpcre_compile_options = compile_options & G_REGEX_COMPILE_NONPCRE_MASK;
|
||||
|
||||
/* In GRegex the string are, by default, UTF-8 encoded. PCRE
|
||||
* instead uses UTF-8 only if required with PCRE_UTF8. */
|
||||
if (compile_options & G_REGEX_RAW)
|
||||
@ -1343,7 +1396,9 @@ g_regex_new (const gchar *pattern,
|
||||
{
|
||||
/* enable utf-8 */
|
||||
compile_options |= PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
|
||||
match_options |= PCRE_NO_UTF8_CHECK;
|
||||
|
||||
if (match_options != NULL)
|
||||
*match_options |= PCRE_NO_UTF8_CHECK;
|
||||
}
|
||||
|
||||
/* PCRE_NEWLINE_ANY is the default for the internal PCRE but
|
||||
@ -1408,32 +1463,10 @@ g_regex_new (const gchar *pattern,
|
||||
compile_options |= G_REGEX_DUPNAMES;
|
||||
}
|
||||
|
||||
regex = g_new0 (GRegex, 1);
|
||||
regex->ref_count = 1;
|
||||
regex->pattern = g_strdup (pattern);
|
||||
regex->pcre_re = re;
|
||||
regex->compile_opts = compile_options;
|
||||
regex->match_opts = match_options;
|
||||
if (compile_options_out != 0)
|
||||
*compile_options_out = compile_options;
|
||||
|
||||
if (optimize)
|
||||
{
|
||||
regex->extra = pcre_study (regex->pcre_re, 0, &errmsg);
|
||||
if (errmsg != NULL)
|
||||
{
|
||||
GError *tmp_error = g_error_new (G_REGEX_ERROR,
|
||||
G_REGEX_ERROR_OPTIMIZE,
|
||||
_("Error while optimizing "
|
||||
"regular expression %s: %s"),
|
||||
regex->pattern,
|
||||
errmsg);
|
||||
g_propagate_error (error, tmp_error);
|
||||
|
||||
g_regex_unref (regex);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return regex;
|
||||
return re;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -1873,6 +1906,8 @@ g_regex_match_all_full (const GRegex *regex,
|
||||
{
|
||||
GMatchInfo *info;
|
||||
gboolean done;
|
||||
pcre *pcre_re;
|
||||
pcre_extra *extra;
|
||||
|
||||
g_return_val_if_fail (regex != NULL, FALSE);
|
||||
g_return_val_if_fail (string != NULL, FALSE);
|
||||
@ -1880,6 +1915,29 @@ g_regex_match_all_full (const GRegex *regex,
|
||||
g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
|
||||
g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);
|
||||
|
||||
#ifdef PCRE_NO_AUTO_POSSESS
|
||||
/* For PCRE >= 8.34 we need to turn off PCRE_NO_AUTO_POSSESS, which
|
||||
* is an optimization for normal regex matching, but results in omitting
|
||||
* some shorter matches here, and an observable behaviour change.
|
||||
*
|
||||
* DFA matching is rather niche, and very rarely used according to
|
||||
* codesearch.debian.net, so don't bother caching the recompiled RE. */
|
||||
pcre_re = regex_compile (regex->pattern,
|
||||
regex->compile_opts | PCRE_NO_AUTO_POSSESS,
|
||||
NULL, NULL, error);
|
||||
|
||||
if (pcre_re == NULL)
|
||||
return FALSE;
|
||||
|
||||
/* Not bothering to cache the optimization data either, with similar
|
||||
* reasoning */
|
||||
extra = NULL;
|
||||
#else
|
||||
/* For PCRE < 8.33 the precompiled regex is fine. */
|
||||
pcre_re = regex->pcre_re;
|
||||
extra = regex->extra;
|
||||
#endif
|
||||
|
||||
info = match_info_new (regex, string, string_len, start_position,
|
||||
match_options, TRUE);
|
||||
|
||||
@ -1887,7 +1945,7 @@ g_regex_match_all_full (const GRegex *regex,
|
||||
while (!done)
|
||||
{
|
||||
done = TRUE;
|
||||
info->matches = pcre_dfa_exec (regex->pcre_re, regex->extra,
|
||||
info->matches = pcre_dfa_exec (pcre_re, extra,
|
||||
info->string, info->string_len,
|
||||
info->pos,
|
||||
regex->match_opts | match_options,
|
||||
@ -1917,6 +1975,10 @@ g_regex_match_all_full (const GRegex *regex,
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef PCRE_NO_AUTO_POSSESS
|
||||
pcre_free (pcre_re);
|
||||
#endif
|
||||
|
||||
/* set info->pos to -1 so that a call to g_match_info_next() fails. */
|
||||
info->pos = -1;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user