Merge branch 'th/gdataset-index' into 'main'

[th/gdataset-index] add a lookup index (GHashTable) to `GData`

See merge request GNOME/glib!3885
This commit is contained in:
Thomas Haller 2024-07-15 21:20:05 +00:00
commit 86752d9dc8
2 changed files with 347 additions and 97 deletions

View File

@ -68,6 +68,22 @@
#define G_DATALIST_FLAGS_MASK_INTERNAL 0x7
/* When GData.alloc grows to size ALLOC_THRESHOLD_INDEX, we reserve one additional
* GHashTable* at &data->data[data->alloc]. This will contain the index for fast
* lookup. See datalist_index*() helpers.
*
* Note that we grow the GData.data buffer by doubling the allocation size. So
* we first allocate 64 entries, when adding the 33 entry.
*
* Conversely, we exponentially shrink the buffer. That means, when we remove
* entries and reach 16 (or lower), we will shrink the buffer from 64 to 32
* entries (and stop using the index).
*
* So we start using the index when adding >= 33 entries. And stop using it
* when removing to <= 16 entries.
*/
#define ALLOC_THRESHOLD_INDEX 64u
#define G_DATALIST_CLEAN_POINTER(ptr) \
((GData *) ((gpointer) (((guintptr) (ptr)) & ~((guintptr) G_DATALIST_FLAGS_MASK_INTERNAL))))
@ -158,53 +174,178 @@ g_datalist_unlock_and_set (GData **datalist, gpointer ptr)
g_pointer_bit_unlock_and_set ((void **) datalist, DATALIST_LOCK_BIT, ptr, G_DATALIST_FLAGS_MASK_INTERNAL);
}
static gsize
datalist_alloc_size (guint32 alloc)
{
/* GDataElt also contains pointer. It thus is suitable aligned for pointers,
* and we can just append the pointer for the index at the end. */
return G_STRUCT_OFFSET (GData, data) +
(((gsize) alloc) * sizeof (GDataElt)) +
(G_UNLIKELY (alloc >= ALLOC_THRESHOLD_INDEX) ? sizeof (GHashTable *) : 0u);
}
G_ALWAYS_INLINE static inline GHashTable **
datalist_index_get_ptr (GData *data)
{
if (G_LIKELY (data->alloc < ALLOC_THRESHOLD_INDEX))
return NULL;
return (gpointer) (&(data->data[data->alloc]));
}
G_ALWAYS_INLINE static inline GHashTable *
datalist_index_get (GData *data)
{
GHashTable **p_index;
p_index = datalist_index_get_ptr (data);
#if G_ENABLE_DEBUG
g_assert (!p_index || *p_index);
#endif
return G_UNLIKELY (p_index) ? *p_index : NULL;
}
static guint
_datalist_index_hash (gconstpointer key)
{
const GQuark *ptr = key;
G_STATIC_ASSERT (G_STRUCT_OFFSET (GDataElt, key) == 0);
return *ptr;
}
static gboolean
_datalist_index_equal (gconstpointer a, gconstpointer b)
{
const GQuark *ptr_a = a;
const GQuark *ptr_b = b;
return *ptr_a == *ptr_b;
}
G_ALWAYS_INLINE static inline GHashTable *
datalist_index_new (void)
{
return g_hash_table_new (_datalist_index_hash, _datalist_index_equal);
}
static GData *
datalist_realloc (GData *data, guint32 alloc, gboolean *out_reallocated)
{
guintptr data_old;
gboolean reallocated;
GHashTable *index;
GHashTable **p_index;
guint32 i;
data_old = (guintptr) ((gpointer) data);
index = datalist_index_get (data);
data = g_realloc (data, datalist_alloc_size (alloc));
reallocated = (((guintptr) ((gpointer) (data))) != data_old);
data->alloc = alloc;
if (out_reallocated)
*out_reallocated = reallocated;
/* Note that if data was @reallocated, then @index contains only dangling pointers.
* We can only destroy/remove-all, which we rely on not following those pointers. */
p_index = datalist_index_get_ptr (data);
if (G_LIKELY (!p_index))
{
if (G_UNLIKELY (index))
g_hash_table_unref (index);
}
else if (!reallocated && index)
{
/* The index is still fine and the pointers are all still valid. We
* can keep it. */
*p_index = index;
}
else
{
if (G_UNLIKELY (index))
{
/* Note that the GHashTable's keys are now all dangling pointers!
* We rely on remove-all to not following them. */
g_hash_table_remove_all (index);
}
else
index = datalist_index_new ();
*p_index = index;
for (i = 0; i < data->len; i++)
g_hash_table_add (index, &data->data[i]);
}
return data;
}
static gboolean
datalist_append (GData **data, GQuark key_id, gpointer new_data, GDestroyNotify destroy_func)
{
GDataElt *data_elt;
GHashTable *index;
gboolean reallocated;
GData *d;
d = *data;
if (!d)
{
d = g_malloc (G_STRUCT_OFFSET (GData, data) + 2u * sizeof (GDataElt));
d = g_malloc (datalist_alloc_size (2u));
d->len = 0;
d->alloc = 2u;
if (2u >= ALLOC_THRESHOLD_INDEX)
*(datalist_index_get_ptr (d)) = datalist_index_new ();
*data = d;
reallocated = TRUE;
}
else if (d->len == d->alloc)
{
d->alloc = d->alloc * 2u;
#if G_ENABLE_DEBUG
/* d->alloc is always a power of two. It thus overflows the first time
* when going to (G_MAXUINT32+1), or when requesting 2^31+1 elements.
*
* This is not handled, and we just crash. That's because we track the GData
* in a linear list, which horribly degrades long before we add 2 billion entries.
* Don't ever try to do that. */
g_assert (d->alloc > d->len);
#endif
d = g_realloc (d, G_STRUCT_OFFSET (GData, data) + d->alloc * sizeof (GDataElt));
guint32 alloc = d->alloc * 2u;
if (G_UNLIKELY (alloc < d->alloc))
{
if (d->alloc == G_MAXUINT32)
g_error ("GData cannot contain more than 4294967295 entries");
alloc = G_MAXUINT32;
}
d = datalist_realloc (d, alloc, &reallocated);
*data = d;
reallocated = TRUE;
}
else
reallocated = FALSE;
d->data[d->len] = (GDataElt){
data_elt = &d->data[d->len];
*data_elt = (GDataElt){
.key = key_id,
.data = new_data,
.destroy = destroy_func,
};
d->len++;
index = datalist_index_get (d);
if (G_UNLIKELY (index))
g_hash_table_add (index, data_elt);
return reallocated;
}
static void
datalist_remove (GData *data, guint32 idx)
{
GHashTable *index;
#if G_ENABLE_DEBUG
g_assert (idx < data->len);
#endif
@ -214,15 +355,24 @@ datalist_remove (GData *data, guint32 idx)
* to @idx are left unchanged, and the last entry is moved to position @idx.
* */
index = datalist_index_get (data);
if (G_UNLIKELY (index))
g_hash_table_remove (index, &data->data[idx]);
data->len--;
if (idx != data->len)
data->data[idx] = data->data[data->len];
{
data->data[idx] = data->data[data->len];
if (G_UNLIKELY (index))
g_hash_table_add (index, &data->data[idx]);
}
}
static gboolean
datalist_shrink (GData **data, GData **d_to_free)
{
gboolean reallocated;
guint32 alloc_by_4;
guint32 v;
GData *d;
@ -239,10 +389,17 @@ datalist_shrink (GData **data, GData **d_to_free)
if (d->len == 0)
{
GHashTable *index;
/* The list became empty. We drop the allocated memory altogether. */
/* The caller will free the buffer after releasing the lock, to minimize
* the time we hold the lock. Transfer it out. */
index = datalist_index_get (d);
if (G_UNLIKELY (index))
g_hash_table_unref (index);
*d_to_free = d;
*data = NULL;
return TRUE;
@ -253,8 +410,9 @@ datalist_shrink (GData **data, GData **d_to_free)
v = d->len;
if (v != alloc_by_4)
{
/* d->alloc is a power of two. Usually, we remove one element at a
* time, then we will just reach reach a quarter of that.
/* d->alloc is a power of two (unless it's G_MAXUINT32). Usually, we
* remove one element at a time, then we will just reach reach a quarter
* of that.
*
* However, with g_datalist_id_remove_multiple(), len can be smaller
* at once. In that case, find first the next power of two. */
@ -264,27 +422,55 @@ datalist_shrink (GData **data, GData **d_to_free)
#if G_ENABLE_DEBUG
g_assert (v > d->len);
g_assert (v <= d->alloc / 2u);
g_assert (v <= (d->alloc == G_MAXUINT32 ? 0x80000000u : d->alloc / 2u));
#endif
d->alloc = v;
d = g_realloc (d, G_STRUCT_OFFSET (GData, data) + (v * sizeof (GDataElt)));
d = datalist_realloc (d, v, &reallocated);
*d_to_free = NULL;
*data = d;
return TRUE;
return reallocated;
}
static void
datalist_destroy (GData *data)
{
GHashTable *index;
guint32 i;
/* Must be called without lock. Will free @data and invoke the
* destroy() notifications. */
index = datalist_index_get (data);
if (G_UNLIKELY (index))
g_hash_table_unref (index);
for (i = 0; i < data->len; i++)
{
if (data->data[i].destroy)
data->data[i].destroy (data->data[i].data);
}
g_free (data);
}
static GDataElt *
datalist_find (GData *data, GQuark key_id, guint32 *out_idx)
{
GDataElt *data_elt;
GHashTable *index;
guint32 i;
if (data)
if (!data)
goto out_not_found;
index = datalist_index_get (data);
if (G_LIKELY (!index))
{
/* We have no index. Do a linear search. */
for (i = 0; i < data->len; i++)
{
GDataElt *data_elt = &data->data[i];
data_elt = &data->data[i];
if (data_elt->key == key_id)
{
if (out_idx)
@ -292,7 +478,23 @@ datalist_find (GData *data, GQuark key_id, guint32 *out_idx)
return data_elt;
}
}
goto out_not_found;
}
data_elt = g_hash_table_lookup (index, &key_id);
if (!data_elt)
goto out_not_found;
#if G_ENABLE_DEBUG
g_assert (data_elt >= data->data && data_elt < &data->data[data->len]);
#endif
if (out_idx)
*out_idx = (data_elt - data->data);
return data_elt;
out_not_found:
if (out_idx)
*out_idx = G_MAXUINT32;
return NULL;
@ -310,7 +512,6 @@ void
g_datalist_clear (GData **datalist)
{
GData *data;
guint i;
g_return_if_fail (datalist != NULL);
@ -324,13 +525,7 @@ g_datalist_clear (GData **datalist)
g_datalist_unlock_and_set (datalist, NULL);
for (i = 0; i < data->len; i++)
{
if (data->data[i].data && data->data[i].destroy)
data->data[i].destroy (data->data[i].data);
}
g_free (data);
datalist_destroy (data);
}
/* HOLDS: g_dataset_global_lock */
@ -359,7 +554,6 @@ g_dataset_destroy_internal (GDataset *dataset)
while (dataset)
{
GData *data;
guint i;
data = G_DATALIST_GET_POINTER (&dataset->datalist);
@ -376,12 +570,7 @@ g_dataset_destroy_internal (GDataset *dataset)
G_UNLOCK (g_dataset_global);
for (i = 0; i < data->len; i++)
{
if (data->data[i].data && data->data[i].destroy)
data->data[i].destroy (data->data[i].data);
}
g_free (data);
datalist_destroy (data);
G_LOCK (g_dataset_global);
dataset = g_dataset_lookup (dataset_location);
@ -530,7 +719,9 @@ g_data_remove_internal (GData **datalist,
GData *d_to_free;
gsize found_keys;
gsize i_keys;
guint32 i_data;
if (n_keys == 0)
return;
d = g_datalist_lock_and_get (datalist);
@ -556,33 +747,23 @@ g_data_remove_internal (GData **datalist,
old = old_to_free;
}
i_data = 0;
found_keys = 0;
while (i_data < d->len && found_keys < n_keys)
for (i_keys = 0; i_keys < n_keys; i_keys++)
{
GDataElt *data = &d->data[i_data];
gboolean remove = FALSE;
GDataElt *data_elt;
guint32 idx;
for (i_keys = 0; i_keys < n_keys; i_keys++)
{
if (data->key == keys[i_keys])
{
/* We must invoke the destroy notifications in the order of @keys.
* Hence, build up the list @old at index @i_keys. */
old[i_keys] = *data;
found_keys++;
remove = TRUE;
break;
}
}
data_elt = datalist_find (d, keys[i_keys], &idx);
if (!data_elt)
continue;
if (!remove)
{
i_data++;
continue;
}
datalist_remove (d, i_data);
/* We must destroy the keys in the order in which they are specified.
* We achieve that here.
*
* Note that even if @keys contains duplicates, we correctly only
* find them once, as we remove the found entry right away. */
old[found_keys++] = *data_elt;
datalist_remove (d, idx);
}
if (found_keys > 0 && datalist_shrink (&d, &d_to_free))
@ -594,13 +775,10 @@ g_data_remove_internal (GData **datalist,
else
g_datalist_unlock (datalist);
if (found_keys > 0)
for (i_keys = 0; i_keys < found_keys; i_keys++)
{
for (i_keys = 0; i_keys < n_keys; i_keys++)
{
if (old[i_keys].destroy)
old[i_keys].destroy (old[i_keys].data);
}
if (old[i_keys].destroy)
old[i_keys].destroy (old[i_keys].data);
}
if (G_UNLIKELY (old_to_free))
@ -803,9 +981,7 @@ g_datalist_id_set_data_full (GData **datalist,
* This is more efficient than calling g_datalist_id_remove_data()
* multiple times in a row.
*
* Before 2.80, @n_keys had to be not larger than 16. Now it can be larger, but
* note that GData does a linear search, so an excessive number of keys will
* perform badly.
* Before 2.80, @n_keys had to be not larger than 16.
*
* Since: 2.74
*/
@ -1228,37 +1404,56 @@ g_datalist_id_replace_data (GData **datalist,
* is not found.
**/
gpointer
g_datalist_get_data (GData **datalist,
const gchar *key)
g_datalist_get_data (GData **datalist,
const gchar *key)
{
GQuark key_id;
GHashTable *index;
gpointer res = NULL;
GDataElt *data_elt;
GData *d;
GDataElt *data, *data_end;
g_return_val_if_fail (datalist != NULL, NULL);
d = g_datalist_lock_and_get (datalist);
if (d)
if (!d)
goto out;
index = datalist_index_get (d);
if (G_LIKELY (!index))
{
data = d->data;
data_end = data + d->len;
while (data < data_end)
{
/* Here we intentionally compare by strings, instead of calling
* g_quark_try_string() first.
*
* See commit 1cceda49b60b ('Make g_datalist_get_data not look up the
* quark').
*/
if (g_strcmp0 (g_quark_to_string (data->key), key) == 0)
{
res = data->data;
break;
}
data++;
}
guint32 i;
for (i = 0; i < d->len; i++)
{
data_elt = &d->data[i];
/* Here we intentionally compare by strings, instead of calling
* g_quark_try_string() first.
*
* See commit 1cceda49b60b ('Make g_datalist_get_data not look up the
* quark').
*/
if (g_strcmp0 (g_quark_to_string (data_elt->key), key) == 0)
{
res = data_elt->data;
goto out;
}
}
goto out;
}
key_id = g_quark_try_string (key);
if (key_id == 0 && key)
goto out;
data_elt = g_hash_table_lookup (index, &key_id);
if (data_elt)
res = data_elt->data;
out:
g_datalist_unlock (datalist);
return res;

View File

@ -305,6 +305,7 @@ test_datalist_id_remove_multiple_resize (void)
{
GQuark *quarks;
GQuark *quarks2;
gboolean *has;
const guint N = 1000;
const guint PRIME = 1048583u;
guint i;
@ -314,6 +315,7 @@ test_datalist_id_remove_multiple_resize (void)
quarks = g_new (GQuark, N);
quarks2 = g_new (GQuark, N);
has = g_new0 (gboolean, N);
for (i = 0; i < N; i++)
{
@ -322,12 +324,15 @@ test_datalist_id_remove_multiple_resize (void)
}
for (i = 0; i < N; i++)
g_datalist_id_set_data (&list, quarks[i], GINT_TO_POINTER (i));
{
g_datalist_id_set_data (&list, quarks[i], (gpointer) g_quark_to_string (quarks[i]));
has[i] = TRUE;
}
/* Now we perform a list of random operations (remove/add quarks). */
for (i_run = 0; TRUE; i_run++)
{
int MODE = ((guint) g_test_rand_int ()) % 4;
int MODE = ((guint) g_test_rand_int ()) % 6;
guint n;
guint j;
@ -355,9 +360,15 @@ test_datalist_id_remove_multiple_resize (void)
{
j = (j + PRIME) % N;
if (MODE == 0)
g_datalist_id_remove_data (&list, quarks[j]);
{
g_datalist_id_remove_data (&list, quarks[j]);
has[j] = FALSE;
}
else
g_datalist_id_set_data (&list, quarks[j], GINT_TO_POINTER (j));
{
g_datalist_id_set_data (&list, quarks[j], (gpointer) g_quark_to_string (quarks[j]));
has[j] = TRUE;
}
}
break;
case 3:
@ -366,15 +377,59 @@ test_datalist_id_remove_multiple_resize (void)
{
j = (j + PRIME) % N;
quarks2[i] = quarks[j];
has[j] = FALSE;
}
g_datalist_id_remove_multiple (&list, quarks2, n);
break;
case 4:
/* Mode: lookup string via g_datalist_get_data()*/
for (i = 0; i < n; i++)
{
const char *data;
const char *data2;
const char *key;
j = (j + PRIME) % N;
key = g_quark_to_string (quarks[j]);
data = g_datalist_id_get_data (&list, quarks[j]);
data2 = g_datalist_get_data (&list, key);
g_assert_true (data == data2);
if (data)
g_assert_true (data == key);
g_assert_true ((!!data) == has[j]);
}
break;
case 5:
/* Fill/empty the list completely. */
switch (((guint) g_test_rand_int ()) % 5)
{
case 0:
g_datalist_clear (&list);
for (i = 0; i < N; i++)
has[i] = FALSE;
break;
case 1:
for (i = 0; i < N; i++)
{
j = (j + PRIME) % N;
g_datalist_id_set_data (&list, quarks[j], (gpointer) g_quark_to_string (quarks[j]));
has[i] = TRUE;
}
break;
default:
/* Most of the time we do nothing. The case where we fill/empty
* the list entirely is less interesting. */
break;
}
}
}
g_free (quarks);
g_free (quarks2);
g_free (has);
}
static void