Add simple string "search" function

Add a pair of functions to make it easier to do simple string matching. This will be useful for use with things like GtkSearchBar and will also be the basis of the searching done by the (soon to appear) g_desktop_app_info_search() https://bugzilla.gnome.org/show_bug.cgi?id=709753
2025-11-01 08:52:18 +01:00 · 2013-10-14 14:37:10 -04:00
parent 4c510801cf
commit 38dfce5992
3 changed files with 251 additions and 0 deletions
--- a/docs/reference/glib/glib-sections.txt
+++ b/docs/reference/glib/glib-sections.txt
@@ -1282,6 +1282,8 @@ g_strrstr_len
 g_str_has_prefix
 g_str_has_suffix
 g_strcmp0
+g_str_tokenize_and_fold
+g_str_match_string

 <SUBSECTION>
 g_strlcpy
--- a/glib/gstrfuncs.c
+++ b/glib/gstrfuncs.c
@@ -2853,3 +2853,242 @@ g_strv_length (gchar **str_array)

  return i;
 }
+
+static void
+index_add_folded (GPtrArray   *array,
+                  const gchar *start,
+                  const gchar *end)
+{
+  gchar *normal;
+
+  normal = g_utf8_normalize (start, end - start, G_NORMALIZE_ALL_COMPOSE);
+
+  /* TODO: Invent time machine.  Converse with Mustafa Ataturk... */
+  if (strstr (normal, "ı") || strstr (normal, "İ"))
+    {
+      gchar *s = normal;
+      GString *tmp;
+
+      tmp = g_string_new (NULL);
+
+      while (*s)
+        {
+          gchar *i, *I, *e;
+
+          i = strstr (s, "ı");
+          I = strstr (s, "İ");
+
+          if (!i && !I)
+            break;
+          else if (i && !I)
+            e = i;
+          else if (I && !i)
+            e = I;
+          else if (i < I)
+            e = i;
+          else
+            e = I;
+
+          g_string_append_len (tmp, s, e - s);
+          g_string_append_c (tmp, 'i');
+          s = g_utf8_next_char (e);
+        }
+
+      g_string_append (tmp, s);
+      g_free (normal);
+      normal = g_string_free (tmp, FALSE);
+    }
+
+  g_ptr_array_add (array, g_utf8_casefold (normal, -1));
+  g_free (normal);
+}
+
+static gchar **
+split_words (const gchar *value)
+{
+  const gchar *start = NULL;
+  GPtrArray *result;
+  const gchar *s;
+
+  result = g_ptr_array_new ();
+
+  for (s = value; *s; s = g_utf8_next_char (s))
+    {
+      gunichar c = g_utf8_get_char (s);
+
+      if (start == NULL)
+        {
+          if (g_unichar_isalnum (c) || g_unichar_ismark (c))
+            start = s;
+        }
+      else
+        {
+          if (!g_unichar_isalnum (c) && !g_unichar_ismark (c))
+            {
+              index_add_folded (result, start, s);
+              start = NULL;
+            }
+        }
+    }
+
+  if (start)
+    index_add_folded (result, start, s);
+
+  g_ptr_array_add (result, NULL);
+
+  return (gchar **) g_ptr_array_free (result, FALSE);
+}
+
+/**
+ * g_str_tokenize_and_fold:
+ * @string: a string
+ * @translit_locale: (allow-none): the language code (like 'de' or
+ *   'en_GB') from which @string originates
+ * @ascii_alternates: (out) (transfer full) (array zero-terminated=1): a
+ *   return location for ASCII alternates
+ *
+ * Tokenises @string and performs folding on each token.
+ *
+ * A token is a non-empty sequence of alphanumeric characters in the
+ * source string, separated by non-alphanumeric characters.  An
+ * "alphanumeric" character for this purpose is one that matches
+ * g_unichar_isalnum() or g_unichar_ismark().
+ *
+ * Each token is then (Unicode) normalised and case-folded.  If
+ * @ascii_alternates is non-%NULL and some of the returned tokens
+ * contain non-ASCII characters, ASCII alternatives will be generated.
+ *
+ * The number of ASCII alternatives that are generated and the method
+ * for doing so is unspecified, but @translit_locale (if specified) may
+ * improve the transliteration if the language of the source string is
+ * known.
+ *
+ * Returns: the folded tokens
+ *
+ * Since: 2.40
+ **/
+gchar **
+g_str_tokenize_and_fold (const gchar   *string,
+                         const gchar   *translit_locale,
+                         gchar       ***ascii_alternates)
+{
+  gchar **result;
+
+  if (ascii_alternates && g_str_is_ascii (string))
+    {
+      *ascii_alternates = g_new0 (gchar *, 0 + 1);
+      ascii_alternates = NULL;
+    }
+
+  result = split_words (string);
+
+  /* TODO: proper iconv transliteration (locale-dependent) */
+  if (ascii_alternates)
+    {
+      gint i, j, n;
+
+      n = g_strv_length (result);
+      *ascii_alternates = g_new (gchar *, n + 1);
+      j = 0;
+
+      for (i = 0; i < n; i++)
+        {
+          if (!g_str_is_ascii (result[i]))
+            {
+              gchar *decomposed;
+              gchar *ascii;
+              gint k = 0;
+              gint l = 0;
+
+              decomposed = g_utf8_normalize (result[i], -1, G_NORMALIZE_ALL);
+              ascii = g_malloc (strlen (decomposed) + 1);
+
+              for (k = 0; decomposed[k]; k++)
+                if (~decomposed[k] & 0x80)
+                  ascii[l++] = decomposed[k];
+              ascii[l] = '\0';
+
+              (*ascii_alternates)[j++] = ascii;
+              g_free (decomposed);
+            }
+        }
+
+      (*ascii_alternates)[j] = NULL;
+    }
+
+  return result;
+}
+
+/**
+ * g_search_match_string:
+ * @search_term: the search term from the user
+ * @potential_hit: the text that may be a hit
+ * @accept_alternates: %TRUE to accept ASCII alternates
+ *
+ * Checks if a search conducted for @search_term should match
+ * @potential_hit.
+ *
+ * This function calls g_search_tokenize_and_fold_string() on both
+ * @search_term and @potential_hit.  ASCII alternates are never taken
+ * for @search_term but will be taken for @potential_hit according to
+ * the value of @accept_alternates.
+ *
+ * A hit occurs when each folded token in @search_term is a prefix of a
+ * folded token from @potential_hit.
+ *
+ * Depending on how you're performing the search, it will typically be
+ * faster to call g_search_tokenize_and_fold_string() on each string in
+ * your corpus and build an index on the returned folded tokens, then
+ * call g_search_tokenize_and_fold_string() on the search term and
+ * perform lookups into that index.
+ *
+ * As some examples, searching for "fred" would match the potential hit
+ * "Smith, Fred" and also "Frédéric".  Searching for "Fréd" would match
+ * "Frédéric" but not "Frederic" (due to the one-directional nature of
+ * accent matching).  Searching "fo" would match "Foo" and "Bar Foo
+ * Baz", but not "SFO" (because no word as "fo" as a prefix).
+ *
+ * Returns: %TRUE if @potential_hit is a hit
+ *
+ * Since: 2.40
+ **/
+gboolean
+g_str_match_string (const gchar *search_term,
+                    const gchar *potential_hit,
+                    gboolean     accept_alternates)
+{
+  gchar **alternates = NULL;
+  gchar **term_tokens;
+  gchar **hit_tokens;
+  gboolean matched;
+  gint i, j;
+
+  term_tokens = g_str_tokenize_and_fold (search_term, NULL, NULL);
+  hit_tokens = g_str_tokenize_and_fold (potential_hit, NULL, accept_alternates ? &alternates : NULL);
+
+  matched = TRUE;
+
+  for (i = 0; term_tokens[i]; i++)
+    {
+      for (j = 0; hit_tokens[j]; j++)
+        if (g_str_has_prefix (hit_tokens[j], term_tokens[i]))
+          goto one_matched;
+
+      if (accept_alternates)
+        for (j = 0; alternates[j]; j++)
+          if (g_str_has_prefix (alternates[j], term_tokens[i]))
+            goto one_matched;
+
+      matched = FALSE;
+      break;
+
+one_matched:
+      continue;
+    }
+
+  g_strfreev (term_tokens);
+  g_strfreev (hit_tokens);
+  g_strfreev (alternates);
+
+  return matched;
+}
--- a/glib/gstrfuncs.h
+++ b/glib/gstrfuncs.h
@@ -290,6 +290,16 @@ GLIB_AVAILABLE_IN_ALL
 gchar*                g_stpcpy         (gchar        *dest,
                                        const char   *src);

+GLIB_AVAILABLE_IN_2_40
+gchar **                g_str_tokenize_and_fold                         (const gchar   *string,
+                                                                         const gchar   *translit_locale,
+                                                                         gchar       ***ascii_alternates);
+
+GLIB_AVAILABLE_IN_2_40
+gboolean                g_str_match_string                              (const gchar   *search_term,
+                                                                         const gchar   *potential_hit,
+                                                                         gboolean       accept_alternates);
+
 G_END_DECLS

 #endif /* __G_STRFUNCS_H__ */