Merge branch '1323-aarch64-mem-barrier' into 'master'

gthread: Use C11-style memory consistency to speed up g_once()

Closes #1323

See merge request GNOME/glib!1364
This commit is contained in:
Sebastian Dröge 2020-05-19 17:22:41 +00:00
commit 1d61c97761
6 changed files with 113 additions and 29 deletions

View File

@@ -147,7 +147,6 @@ typedef unsigned @glib_intptr_type_define@ guintptr;
#define G_THREADS_ENABLED
#define G_THREADS_IMPL_@g_threads_impl_def@
#mesondefine G_ATOMIC_OP_MEMORY_BARRIER_NEEDED
#mesondefine G_ATOMIC_LOCK_FREE
#define GINT16_TO_@g_bs_native@(val) ((gint16) (val))

View File

@@ -630,13 +630,25 @@ g_once_impl (GOnce *once,
if (once->status != G_ONCE_STATUS_READY)
{
gpointer retval;
once->status = G_ONCE_STATUS_PROGRESS;
g_mutex_unlock (&g_once_mutex);
once->retval = func (arg);
retval = func (arg);
g_mutex_lock (&g_once_mutex);
/* We prefer the new C11-style atomic extension of GCC if available. If not,
* fall back to always locking. */
#if defined(G_ATOMIC_LOCK_FREE) && defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4) && defined(__ATOMIC_SEQ_CST)
/* Only the second store needs to be atomic, as the two writes are related
* by a happens-before relationship here. */
once->retval = retval;
__atomic_store_n (&once->status, G_ONCE_STATUS_READY, __ATOMIC_RELEASE);
#else
once->retval = retval;
once->status = G_ONCE_STATUS_READY;
#endif
g_cond_broadcast (&g_once_cond);
}

View File

@@ -234,14 +234,23 @@ GLIB_AVAILABLE_IN_ALL
void g_once_init_leave (volatile void *location,
gsize result);
#ifdef G_ATOMIC_OP_MEMORY_BARRIER_NEEDED
# define g_once(once, func, arg) g_once_impl ((once), (func), (arg))
#else /* !G_ATOMIC_OP_MEMORY_BARRIER_NEEDED*/
/* Use C11-style atomic extensions to check the fast path for status=ready. If
* they are not available, fall back to using a mutex and condition variable in
* g_once_impl().
*
* On the C11-style codepath, only the load of once->status needs to be atomic,
* as the writes to it and once->retval in g_once_impl() are related by a
* happens-before relation. Release-acquire semantics are defined such that any
* atomic/non-atomic write which happens-before a store/release is guaranteed to
* be seen by the load/acquire of the same atomic variable. */
#if defined(G_ATOMIC_LOCK_FREE) && defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4) && defined(__ATOMIC_SEQ_CST)
# define g_once(once, func, arg) \
(((once)->status == G_ONCE_STATUS_READY) ? \
((__atomic_load_n (&(once)->status, __ATOMIC_ACQUIRE) == G_ONCE_STATUS_READY) ? \
(once)->retval : \
g_once_impl ((once), (func), (arg)))
#endif /* G_ATOMIC_OP_MEMORY_BARRIER_NEEDED */
#else
# define g_once(once, func, arg) g_once_impl ((once), (func), (arg))
#endif
#ifdef __GNUC__
# define g_once_init_enter(location) \

View File

@@ -34,11 +34,13 @@ do_once (gpointer data)
}
static void
test_once1 (void)
test_once_single_threaded (void)
{
GOnce once = G_ONCE_INIT;
gpointer res;
g_test_summary ("Test g_once() usage from a single thread");
g_assert (once.status == G_ONCE_STATUS_NOTCALLED);
res = g_once (&once, do_once, NULL);
@@ -50,11 +52,80 @@ test_once1 (void)
g_assert_cmpint (GPOINTER_TO_INT (res), ==, 1);
}
static GOnce once_multi_threaded = G_ONCE_INIT;
static gint once_multi_threaded_counter = 0;
static GCond once_multi_threaded_cond;
static GMutex once_multi_threaded_mutex;
static guint once_multi_threaded_n_threads_waiting = 0;
/* The once-function for the multi-threaded g_once() test. It should run
 * exactly once, by one thread; atomics are used on the counter so that,
 * should a bug ever let several threads in here, none of their increments
 * to `once_multi_threaded_counter` could be lost and make the test pass
 * by accident. */
static gpointer
do_once_multi_threaded (gpointer data)
{
  gint previous = g_atomic_int_add (&once_multi_threaded_counter, 1);

  return GINT_TO_POINTER (previous + 1);
}
/* Worker for the multi-threaded g_once() test: waits until all peer threads
 * have started, then races them all to call g_once() on the shared
 * once_multi_threaded structure. Returns NULL; the check is the assertion
 * on the g_once() result below. `data` carries the expected thread count. */
static gpointer
once_thread_func (gpointer data)
{
  gpointer res;
  guint n_threads_expected = GPOINTER_TO_UINT (data);

  /* Don't immediately call g_once(), otherwise the first thread to be created
   * will end up calling the once-function, and there will be very little
   * contention. */
  g_mutex_lock (&once_multi_threaded_mutex);

  once_multi_threaded_n_threads_waiting++;
  g_cond_broadcast (&once_multi_threaded_cond);

  /* Block until every expected thread has checked in, so they all hit
   * g_once() at (roughly) the same moment. */
  while (once_multi_threaded_n_threads_waiting < n_threads_expected)
    g_cond_wait (&once_multi_threaded_cond, &once_multi_threaded_mutex);
  g_mutex_unlock (&once_multi_threaded_mutex);

  /* Actually run the test: whichever thread executed the once-function,
   * every thread must observe its single result (1). */
  res = g_once (&once_multi_threaded, do_once_multi_threaded, NULL);
  g_assert_cmpint (GPOINTER_TO_INT (res), ==, 1);

  return NULL;
}
static void
test_once2 (void)
test_once_multi_threaded (void)
{
guint i;
GThread *threads[1000];
g_test_summary ("Test g_once() usage from multiple threads");
for (i = 0; i < G_N_ELEMENTS (threads); i++)
threads[i] = g_thread_new ("once-multi-threaded",
once_thread_func,
GUINT_TO_POINTER (G_N_ELEMENTS (threads)));
/* All threads have started up, so start the test. */
g_cond_broadcast (&once_multi_threaded_cond);
for (i = 0; i < G_N_ELEMENTS (threads); i++)
g_thread_join (threads[i]);
g_assert_cmpint (g_atomic_int_get (&once_multi_threaded_counter), ==, 1);
}
static void
test_once_init_single_threaded (void)
{
static gsize init = 0;
g_test_summary ("Test g_once_init_{enter,leave}() usage from a single thread");
if (g_once_init_enter (&init))
{
g_assert (TRUE);
@@ -96,15 +167,17 @@ thread_func (gpointer data)
}
static void
test_once3 (void)
test_once_init_multi_threaded (void)
{
gint i;
GThread *threads[THREADS];
g_test_summary ("Test g_once_init_{enter,leave}() usage from multiple threads");
shared = 0;
for (i = 0; i < THREADS; i++)
threads[i] = g_thread_new ("once3", thread_func, NULL);
threads[i] = g_thread_new ("once-init-multi-threaded", thread_func, NULL);
for (i = 0; i < THREADS; i++)
g_thread_join (threads[i]);
@@ -113,10 +186,12 @@ test_once3 (void)
}
static void
test_once4 (void)
test_once_init_string (void)
{
static const gchar *val;
g_test_summary ("Test g_once_init_{enter,leave}() usage with a string");
if (g_once_init_enter (&val))
g_once_init_leave (&val, "foo");
@@ -128,10 +203,11 @@ main (int argc, char *argv[])
{
g_test_init (&argc, &argv, NULL);
g_test_add_func ("/thread/once1", test_once1);
g_test_add_func ("/thread/once2", test_once2);
g_test_add_func ("/thread/once3", test_once3);
g_test_add_func ("/thread/once4", test_once4);
g_test_add_func ("/once/single-threaded", test_once_single_threaded);
g_test_add_func ("/once/multi-threaded", test_once_multi_threaded);
g_test_add_func ("/once-init/single-threaded", test_once_init_single_threaded);
g_test_add_func ("/once-init/multi-threaded", test_once_init_multi_threaded);
g_test_add_func ("/once-init/string", test_once_init_string);
return g_test_run ();
}

View File

@@ -105,7 +105,7 @@ foreach test_name, extra_args : gobject_tests
# FIXME: https://gitlab.gnome.org/GNOME/glib/issues/1316
# aka https://bugs.debian.org/880883
if test_name == 'closure-refcount' and ['arm', 'aarch64'].contains(host_cpu_family)
if test_name == 'closure-refcount' and ['arm', 'aarch64'].contains(host_machine.cpu_family())
timeout = timeout * 10
endif

View File

@@ -1676,18 +1676,6 @@ foreach d : inet_defines
glibconfig_conf.set(d[1], val)
endforeach
# We need a more robust approach here...
host_cpu_family = host_machine.cpu_family()
if host_cpu_family == 'x86' or host_cpu_family == 'x86_64' or host_cpu_family == 's390' or host_cpu_family == 's390x' or host_cpu_family.startswith('arm') or host_cpu_family == 'aarch64' or host_cpu_family.startswith('crisv32') or host_cpu_family.startswith('etrax')
glib_memory_barrier_needed = false
elif host_cpu_family.startswith('sparc') or host_cpu_family.startswith('alpha') or host_cpu_family.startswith('powerpc') or host_cpu_family == 'ia64'
glib_memory_barrier_needed = true
else
warning('Unknown host cpu: ' + host_cpu_family)
glib_memory_barrier_needed = true
endif
glibconfig_conf.set('G_ATOMIC_OP_MEMORY_BARRIER_NEEDED', glib_memory_barrier_needed)
# We need to decide at configure time if GLib will use real atomic
# operations ("lock free") or emulated ones with a mutex. This is
# because we must put this information in glibconfig.h so we know if