gregex: Fix return from g_match_info_fetch() for unmatched subpatterns

If there were more subpatterns in the regex than matches (which can
happen if one or more of the subpatterns are optional),
`g_match_info_fetch()` was erroneously returning `NULL` rather than the
empty string. It should only return `NULL` when the `match_num`
specifies a subpattern which doesn’t exist in the regex.

This is complicated slightly by the fact that when using
`g_regex_match_all()`, more matches can be returned than there are
subpatterns, due to one or more subpatterns matching multiple times at
different offsets in the string.

This includes a fix for a unit test which was erroneously checking the
broken behaviour.

Thanks to Allison Karlitskaya for the minimal reproducer.

Signed-off-by: Philip Withnall <pwithnall@endlessos.org>

Fixes: #229
This commit is contained in:
Philip Withnall 2020-11-14 14:08:30 +00:00
parent 601ef3b6be
commit b052620398
2 changed files with 25 additions and 10 deletions

View File

@ -35,6 +35,7 @@
#include "gmessages.h"
#include "gstrfuncs.h"
#include "gatomic.h"
#include "gtestutils.h"
#include "gthread.h"
/**
@ -206,7 +207,8 @@ struct _GMatchInfo
gint ref_count; /* the ref count (atomic) */
GRegex *regex; /* the regex */
GRegexMatchFlags match_opts; /* options used at match time on the regex */
gint matches; /* number of matching sub patterns */
gint matches; /* number of matching sub patterns, guaranteed to be <= (n_subpatterns + 1) if doing a single match (rather than matching all) */
gint n_subpatterns; /* total number of sub patterns in the regex */
gint pos; /* position in the string where last match left off */
gint n_offsets; /* number of offsets */
gint *offsets; /* array of offsets paired 0,1 ; 2,3 ; 3,4 etc */
@ -574,6 +576,9 @@ match_info_new (const GRegex *regex,
match_info->pos = start_position;
match_info->match_opts = match_options;
pcre_fullinfo (regex->pcre_re, regex->extra,
PCRE_INFO_CAPTURECOUNT, &match_info->n_subpatterns);
if (is_dfa)
{
/* These values should be enough for most cases, if they are not
@ -584,10 +589,7 @@ match_info_new (const GRegex *regex,
}
else
{
gint capture_count;
pcre_fullinfo (regex->pcre_re, regex->extra,
PCRE_INFO_CAPTURECOUNT, &capture_count);
match_info->n_offsets = (capture_count + 1) * 3;
match_info->n_offsets = (match_info->n_subpatterns + 1) * 3;
}
match_info->offsets = g_new0 (gint, match_info->n_offsets);
@ -768,6 +770,8 @@ g_match_info_next (GMatchInfo *match_info,
match_info->pos = match_info->offsets[1];
}
g_assert (match_info->matches <= match_info->n_subpatterns + 1);
/* it's possible to get two identical matches when we are matching
* empty strings, for instance if the pattern is "(?=[A-Z0-9])" and
* the string is "RegExTest" we have:
@ -1044,16 +1048,21 @@ g_match_info_fetch_pos (const GMatchInfo *match_info,
g_return_val_if_fail (match_info != NULL, FALSE);
g_return_val_if_fail (match_num >= 0, FALSE);
/* check whether there was an error */
if (match_info->matches < 0)
return FALSE;
/* make sure the sub expression number they're requesting is less than
* the total number of sub expressions that were matched. */
if (match_num >= match_info->matches)
* the total number of sub expressions in the regex. When matching all
* (g_regex_match_all()), also compare against the number of matches */
if (match_num >= MAX (match_info->n_subpatterns + 1, match_info->matches))
return FALSE;
if (start_pos != NULL)
*start_pos = match_info->offsets[2 * match_num];
*start_pos = (match_num < match_info->matches) ? match_info->offsets[2 * match_num] : -1;
if (end_pos != NULL)
*end_pos = match_info->offsets[2 * match_num + 1];
*end_pos = (match_num < match_info->matches) ? match_info->offsets[2 * match_num + 1] : -1;
return TRUE;
}
@ -1989,6 +1998,9 @@ g_regex_match_all_full (const GRegex *regex,
pcre_free (pcre_re);
#endif
/* dont assert that (info->matches <= info->n_subpatterns + 1) as that only
* holds true for a single match, rather than matching all */
/* set info->pos to -1 so that a call to g_match_info_next() fails. */
info->pos = -1;
retval = info->matches >= 0;

View File

@ -2557,13 +2557,16 @@ main (int argc, char *argv[])
TEST_SUB_PATTERN("(a)?(b)", "b", 0, 0, "b", 0, 1);
TEST_SUB_PATTERN("(a)?(b)", "b", 0, 1, "", -1, -1);
TEST_SUB_PATTERN("(a)?(b)", "b", 0, 2, "b", 0, 1);
TEST_SUB_PATTERN("(a)?b", "b", 0, 0, "b", 0, 1);
TEST_SUB_PATTERN("(a)?b", "b", 0, 1, "", -1, -1);
TEST_SUB_PATTERN("(a)?b", "b", 0, 2, NULL, UNTOUCHED, UNTOUCHED);
/* TEST_NAMED_SUB_PATTERN(pattern, string, start_position, sub_name,
* expected_sub, expected_start, expected_end) */
TEST_NAMED_SUB_PATTERN("a(?P<A>.)(?P<B>.)?", "ab", 0, "A", "b", 1, 2);
TEST_NAMED_SUB_PATTERN("a(?P<A>.)(?P<B>.)?", "aab", 1, "A", "b", 2, 3);
TEST_NAMED_SUB_PATTERN("a(?P<A>.)(?P<B>.)?", EURO "ab", 0, "A", "b", 4, 5);
TEST_NAMED_SUB_PATTERN("a(?P<A>.)(?P<B>.)?", EURO "ab", 0, "B", NULL, UNTOUCHED, UNTOUCHED);
TEST_NAMED_SUB_PATTERN("a(?P<A>.)(?P<B>.)?", EURO "ab", 0, "B", "", -1, -1);
TEST_NAMED_SUB_PATTERN("a(?P<A>.)(?P<B>.)?", EURO "ab", 0, "C", NULL, UNTOUCHED, UNTOUCHED);
TEST_NAMED_SUB_PATTERN("a(?P<A>.)(?P<B>.)?", "a" EGRAVE "x", 0, "A", EGRAVE, 1, 3);
TEST_NAMED_SUB_PATTERN("a(?P<A>.)(?P<B>.)?", "a" EGRAVE "x", 0, "B", "x", 3, 4);