Add simple string "search" function

Add a pair of functions to make it easier to do simple string matching.

This will be useful for use with things like GtkSearchBar and will also
be the basis of the searching done by the (soon to appear)
g_desktop_app_info_search()

https://bugzilla.gnome.org/show_bug.cgi?id=709753
This commit is contained in:
Ryan Lortie 2013-10-14 14:37:10 -04:00
parent 4c510801cf
commit 38dfce5992
3 changed files with 251 additions and 0 deletions

View File

@ -1282,6 +1282,8 @@ g_strrstr_len
g_str_has_prefix
g_str_has_suffix
g_strcmp0
g_str_tokenize_and_fold
g_str_match_string
<SUBSECTION>
g_strlcpy

View File

@ -2853,3 +2853,242 @@ g_strv_length (gchar **str_array)
return i;
}
static void
index_add_folded (GPtrArray *array,
const gchar *start,
const gchar *end)
{
gchar *normal;
normal = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL_COMPOSE);
/* TODO: Invent time machine. Converse with Mustafa Ataturk... */
if (strstr (normal, "ı") || strstr (normal, "İ"))
{
gchar *s = normal;
GString *tmp;
tmp = g_string_new (NULL);
while (*s)
{
gchar *i, *I, *e;
i = strstr (s, "ı");
I = strstr (s, "İ");
if (!i && !I)
break;
else if (i && !I)
e = i;
else if (I && !i)
e = I;
else if (i < I)
e = i;
else
e = I;
g_string_append_len (tmp, s, e - s);
g_string_append_c (tmp, 'i');
s = g_utf8_next_char (e);
}
g_string_append (tmp, s);
g_free (normal);
normal = g_string_free (tmp, FALSE);
}
g_ptr_array_add (array, g_utf8_casefold (normal, -1));
g_free (normal);
}
static gchar **
split_words (const gchar *value)
{
const gchar *start = NULL;
GPtrArray *result;
const gchar *s;
result = g_ptr_array_new ();
for (s = value; *s; s = g_utf8_next_char (s))
{
gunichar c = g_utf8_get_char (s);
if (start == NULL)
{
if (g_unichar_isalnum (c) || g_unichar_ismark (c))
start = s;
}
else
{
if (!g_unichar_isalnum (c) && !g_unichar_ismark (c))
{
index_add_folded (result, start, s);
start = NULL;
}
}
}
if (start)
index_add_folded (result, start, s);
g_ptr_array_add (result, NULL);
return (gchar **) g_ptr_array_free (result, FALSE);
}
/**
* g_str_tokenize_and_fold:
* @string: a string
* @translit_locale: (allow-none): the language code (like 'de' or
* 'en_GB') from which @string originates
* @ascii_alternates: (out) (transfer full) (array zero-terminated=1): a
* return location for ASCII alternates
*
* Tokenises @string and performs folding on each token.
*
* A token is a non-empty sequence of alphanumeric characters in the
* source string, separated by non-alphanumeric characters. An
* "alphanumeric" character for this purpose is one that matches
* g_unichar_isalnum() or g_unichar_ismark().
*
* Each token is then (Unicode) normalised and case-folded. If
* @ascii_alternates is non-%NULL and some of the returned tokens
* contain non-ASCII characters, ASCII alternatives will be generated.
*
* The number of ASCII alternatives that are generated and the method
* for doing so is unspecified, but @translit_locale (if specified) may
* improve the transliteration if the language of the source string is
* known.
*
* Returns: the folded tokens
*
* Since: 2.40
**/
gchar **
g_str_tokenize_and_fold (const gchar *string,
const gchar *translit_locale,
gchar ***ascii_alternates)
{
gchar **result;
if (ascii_alternates && g_str_is_ascii (string))
{
*ascii_alternates = g_new0 (gchar *, 0 + 1);
ascii_alternates = NULL;
}
result = split_words (string);
/* TODO: proper iconv transliteration (locale-dependent) */
if (ascii_alternates)
{
gint i, j, n;
n = g_strv_length (result);
*ascii_alternates = g_new (gchar *, n + 1);
j = 0;
for (i = 0; i < n; i++)
{
if (!g_str_is_ascii (result[i]))
{
gchar *decomposed;
gchar *ascii;
gint k = 0;
gint l = 0;
decomposed = g_utf8_normalize (result[i], -1, G_NORMALIZE_ALL);
ascii = g_malloc (strlen (decomposed) + 1);
for (k = 0; decomposed[k]; k++)
if (~decomposed[k] & 0x80)
ascii[l++] = decomposed[k];
ascii[l] = '\0';
(*ascii_alternates)[j++] = ascii;
g_free (decomposed);
}
}
(*ascii_alternates)[j] = NULL;
}
return result;
}
/**
* g_search_match_string:
* @search_term: the search term from the user
* @potential_hit: the text that may be a hit
* @accept_alternates: %TRUE to accept ASCII alternates
*
* Checks if a search conducted for @search_term should match
* @potential_hit.
*
* This function calls g_search_tokenize_and_fold_string() on both
* @search_term and @potential_hit. ASCII alternates are never taken
* for @search_term but will be taken for @potential_hit according to
* the value of @accept_alternates.
*
* A hit occurs when each folded token in @search_term is a prefix of a
* folded token from @potential_hit.
*
* Depending on how you're performing the search, it will typically be
* faster to call g_search_tokenize_and_fold_string() on each string in
* your corpus and build an index on the returned folded tokens, then
* call g_search_tokenize_and_fold_string() on the search term and
* perform lookups into that index.
*
* As some examples, searching for "fred" would match the potential hit
* "Smith, Fred" and also "Frédéric". Searching for "Fréd" would match
* "Frédéric" but not "Frederic" (due to the one-directional nature of
* accent matching). Searching "fo" would match "Foo" and "Bar Foo
* Baz", but not "SFO" (because no word as "fo" as a prefix).
*
* Returns: %TRUE if @potential_hit is a hit
*
* Since: 2.40
**/
gboolean
g_str_match_string (const gchar *search_term,
const gchar *potential_hit,
gboolean accept_alternates)
{
gchar **alternates = NULL;
gchar **term_tokens;
gchar **hit_tokens;
gboolean matched;
gint i, j;
term_tokens = g_str_tokenize_and_fold (search_term, NULL, NULL);
hit_tokens = g_str_tokenize_and_fold (potential_hit, NULL, accept_alternates ? &alternates : NULL);
matched = TRUE;
for (i = 0; term_tokens[i]; i++)
{
for (j = 0; hit_tokens[j]; j++)
if (g_str_has_prefix (hit_tokens[j], term_tokens[i]))
goto one_matched;
if (accept_alternates)
for (j = 0; alternates[j]; j++)
if (g_str_has_prefix (alternates[j], term_tokens[i]))
goto one_matched;
matched = FALSE;
break;
one_matched:
continue;
}
g_strfreev (term_tokens);
g_strfreev (hit_tokens);
g_strfreev (alternates);
return matched;
}

View File

@ -290,6 +290,16 @@ GLIB_AVAILABLE_IN_ALL
gchar* g_stpcpy (gchar *dest,
const char *src);
GLIB_AVAILABLE_IN_2_40
gchar ** g_str_tokenize_and_fold (const gchar *string,
const gchar *translit_locale,
gchar ***ascii_alternates);
GLIB_AVAILABLE_IN_2_40
gboolean g_str_match_string (const gchar *search_term,
const gchar *potential_hit,
gboolean accept_alternates);
G_END_DECLS
#endif /* __G_STRFUNCS_H__ */