Accepting request 1107915 from Base:System

- Add cross-ppc64le package (forwarded request 1107913 from Andreas_Schwab)

OBS-URL: https://build.opensuse.org/request/show/1107915
OBS-URL: https://build.opensuse.org/package/show/openSUSE:Factory/glibc?expand=0&rev=278
This commit is contained in:
Ana Guerrero 2023-08-30 08:19:27 +00:00 committed by Git OBS Bridge
commit 3481e3ba51
7 changed files with 963 additions and 9 deletions

View File

@ -3,5 +3,7 @@
<package>utils</package>
<package>testsuite</package>
<package>cross-aarch64</package>
<package>cross-ppc64le</package>
<package>cross-riscv64</package>
<package>cross-s390x</package>
</multibuild>

286
cache-amd-legacy.patch Normal file
View File

@ -0,0 +1,286 @@
From ced101ed9d3b7cfd12d97ef24940cb00b8658c81 Mon Sep 17 00:00:00 2001
From: Sajan Karumanchi <sajan.karumanchi@amd.com>
Date: Tue, 1 Aug 2023 15:20:55 +0000
Subject: [PATCH] x86: Fix for cache computation on AMD legacy cpus.
Some legacy AMD CPUs and hypervisors have the _cpuid_ '0x8000_001D'
set to Zero, thus resulting in zeroed-out computed cache values.
This patch reintroduces the old way of cache computation as a
fail-safe option to handle these exceptions.
Fixed 'level4_cache_size' value through handle_amd().
Reviewed-by: Premachandra Mallappa <premachandra.mallappa@amd.com>
Tested-by: Florian Weimer <fweimer@redhat.com>
---
sysdeps/x86/dl-cacheinfo.h | 226 ++++++++++++++++++++++++++++++++-----
1 file changed, 199 insertions(+), 27 deletions(-)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index cd4d0351ae..285773039f 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -315,40 +315,206 @@ handle_amd (int name)
{
unsigned int eax;
unsigned int ebx;
- unsigned int ecx;
+ unsigned int ecx = 0;
unsigned int edx;
- unsigned int count = 0x1;
+ unsigned int max_cpuid = 0;
+ unsigned int fn = 0;
/* No level 4 cache (yet). */
if (name > _SC_LEVEL3_CACHE_LINESIZE)
return 0;
- if (name >= _SC_LEVEL3_CACHE_SIZE)
- count = 0x3;
- else if (name >= _SC_LEVEL2_CACHE_SIZE)
- count = 0x2;
- else if (name >= _SC_LEVEL1_DCACHE_SIZE)
- count = 0x0;
+ __cpuid (0x80000000, max_cpuid, ebx, ecx, edx);
+
+ if (max_cpuid >= 0x8000001D)
+ /* Use __cpuid__ '0x8000_001D' to compute cache details. */
+ {
+ unsigned int count = 0x1;
+
+ if (name >= _SC_LEVEL3_CACHE_SIZE)
+ count = 0x3;
+ else if (name >= _SC_LEVEL2_CACHE_SIZE)
+ count = 0x2;
+ else if (name >= _SC_LEVEL1_DCACHE_SIZE)
+ count = 0x0;
+
+ __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx);
+
+ if (ecx != 0)
+ {
+ switch (name)
+ {
+ case _SC_LEVEL1_ICACHE_ASSOC:
+ case _SC_LEVEL1_DCACHE_ASSOC:
+ case _SC_LEVEL2_CACHE_ASSOC:
+ case _SC_LEVEL3_CACHE_ASSOC:
+ return ((ebx >> 22) & 0x3ff) + 1;
+ case _SC_LEVEL1_ICACHE_LINESIZE:
+ case _SC_LEVEL1_DCACHE_LINESIZE:
+ case _SC_LEVEL2_CACHE_LINESIZE:
+ case _SC_LEVEL3_CACHE_LINESIZE:
+ return (ebx & 0xfff) + 1;
+ case _SC_LEVEL1_ICACHE_SIZE:
+ case _SC_LEVEL1_DCACHE_SIZE:
+ case _SC_LEVEL2_CACHE_SIZE:
+ case _SC_LEVEL3_CACHE_SIZE:
+ return (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1);
+ default:
+ __builtin_unreachable ();
+ }
+ return -1;
+ }
+ }
+
+ /* Legacy cache computation for CPUs prior to Bulldozer family.
+ This is also a fail-safe mechanism for some hypervisors that
+ accidentally configure __cpuid__ '0x8000_001D' to Zero. */
- __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx);
+ fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE);
+
+ if (max_cpuid < fn)
+ return 0;
+
+ __cpuid (fn, eax, ebx, ecx, edx);
+
+ if (name < _SC_LEVEL1_DCACHE_SIZE)
+ {
+ name += _SC_LEVEL1_DCACHE_SIZE - _SC_LEVEL1_ICACHE_SIZE;
+ ecx = edx;
+ }
switch (name)
{
- case _SC_LEVEL1_ICACHE_ASSOC:
- case _SC_LEVEL1_DCACHE_ASSOC:
- case _SC_LEVEL2_CACHE_ASSOC:
+ case _SC_LEVEL1_DCACHE_SIZE:
+ return (ecx >> 14) & 0x3fc00;
+
+ case _SC_LEVEL1_DCACHE_ASSOC:
+ ecx >>= 16;
+ if ((ecx & 0xff) == 0xff)
+ {
+ /* Fully associative. */
+ return (ecx << 2) & 0x3fc00;
+ }
+ return ecx & 0xff;
+
+ case _SC_LEVEL1_DCACHE_LINESIZE:
+ return ecx & 0xff;
+
+ case _SC_LEVEL2_CACHE_SIZE:
+ return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00;
+
+ case _SC_LEVEL2_CACHE_ASSOC:
+ switch ((ecx >> 12) & 0xf)
+ {
+ case 0:
+ case 1:
+ case 2:
+ case 4:
+ return (ecx >> 12) & 0xf;
+ case 6:
+ return 8;
+ case 8:
+ return 16;
+ case 10:
+ return 32;
+ case 11:
+ return 48;
+ case 12:
+ return 64;
+ case 13:
+ return 96;
+ case 14:
+ return 128;
+ case 15:
+ return ((ecx >> 6) & 0x3fffc00) / (ecx & 0xff);
+ default:
+ return 0;
+ }
+
+ case _SC_LEVEL2_CACHE_LINESIZE:
+ return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff;
+
+ case _SC_LEVEL3_CACHE_SIZE:
+ {
+ long int total_l3_cache = 0, l3_cache_per_thread = 0;
+ unsigned int threads = 0;
+ const struct cpu_features *cpu_features;
+
+ if ((edx & 0xf000) == 0)
+ return 0;
+
+ total_l3_cache = (edx & 0x3ffc0000) << 1;
+ cpu_features = __get_cpu_features ();
+
+ /* Figure out the number of logical threads that share L3. */
+ if (max_cpuid >= 0x80000008)
+ {
+ /* Get width of APIC ID. */
+ __cpuid (0x80000008, eax, ebx, ecx, edx);
+ threads = (ecx & 0xff) + 1;
+ }
+
+ if (threads == 0)
+ {
+ /* If APIC ID width is not available, use logical
+ processor count. */
+ __cpuid (0x00000001, eax, ebx, ecx, edx);
+ if ((edx & (1 << 28)) != 0)
+ threads = (ebx >> 16) & 0xff;
+ }
+
+ /* Cap usage of highest cache level to the number of
+ supported threads. */
+ if (threads > 0)
+ l3_cache_per_thread = total_l3_cache/threads;
+
+ /* Get shared cache per ccx for Zen architectures. */
+ if (cpu_features->basic.family >= 0x17)
+ {
+ long int l3_cache_per_ccx = 0;
+ /* Get number of threads share the L3 cache in CCX. */
+ __cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx);
+ unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1;
+ l3_cache_per_ccx = l3_cache_per_thread * threads_per_ccx;
+ return l3_cache_per_ccx;
+ }
+ else
+ {
+ return l3_cache_per_thread;
+ }
+ }
+
case _SC_LEVEL3_CACHE_ASSOC:
- return ecx ? ((ebx >> 22) & 0x3ff) + 1 : 0;
- case _SC_LEVEL1_ICACHE_LINESIZE:
- case _SC_LEVEL1_DCACHE_LINESIZE:
- case _SC_LEVEL2_CACHE_LINESIZE:
+ switch ((edx >> 12) & 0xf)
+ {
+ case 0:
+ case 1:
+ case 2:
+ case 4:
+ return (edx >> 12) & 0xf;
+ case 6:
+ return 8;
+ case 8:
+ return 16;
+ case 10:
+ return 32;
+ case 11:
+ return 48;
+ case 12:
+ return 64;
+ case 13:
+ return 96;
+ case 14:
+ return 128;
+ case 15:
+ return ((edx & 0x3ffc0000) << 1) / (edx & 0xff);
+ default:
+ return 0;
+ }
+
case _SC_LEVEL3_CACHE_LINESIZE:
- return ecx ? (ebx & 0xfff) + 1 : 0;
- case _SC_LEVEL1_ICACHE_SIZE:
- case _SC_LEVEL1_DCACHE_SIZE:
- case _SC_LEVEL2_CACHE_SIZE:
- case _SC_LEVEL3_CACHE_SIZE:
- return ecx ? (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1): 0;
+ return (edx & 0xf000) == 0 ? 0 : edx & 0xff;
+
default:
__builtin_unreachable ();
}
@@ -703,7 +869,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
- shared_per_thread = shared;
level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
level1_icache_linesize = handle_amd (_SC_LEVEL1_ICACHE_LINESIZE);
@@ -716,13 +881,20 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
level3_cache_size = shared;
level3_cache_assoc = handle_amd (_SC_LEVEL3_CACHE_ASSOC);
level3_cache_linesize = handle_amd (_SC_LEVEL3_CACHE_LINESIZE);
+ level4_cache_size = handle_amd (_SC_LEVEL4_CACHE_SIZE);
if (shared <= 0)
- /* No shared L3 cache. All we have is the L2 cache. */
- shared = core;
+ {
+ /* No shared L3 cache. All we have is the L2 cache. */
+ shared = core;
+ }
+ else if (cpu_features->basic.family < 0x17)
+ {
+ /* Account for exclusive L2 and L3 caches. */
+ shared += core;
+ }
- if (shared_per_thread <= 0)
- shared_per_thread = shared;
+ shared_per_thread = shared;
}
cpu_features->level1_icache_size = level1_icache_size;
--
2.41.0

45
cache-intel-shared.patch Normal file
View File

@ -0,0 +1,45 @@
From 5ea70cc02626d9b85f1570153873d8648a47bf95 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 10 Aug 2023 19:28:24 -0500
Subject: [PATCH] x86: Fix incorrect scope of setting `shared_per_thread` [BZ#
30745]
The:
```
if (shared_per_thread > 0 && threads > 0)
shared_per_thread /= threads;
```
Code was accidentally moved to inside the else scope. This doesn't
match how it was previously (before af992e7abd).
This patch fixes that by putting the division after the `else` block.
(cherry picked from commit 084fb31bc2c5f95ae0b9e6df4d3cf0ff43471ede)
---
sysdeps/x86/dl-cacheinfo.h | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 285773039f..5ddb35c9d9 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -770,11 +770,10 @@ get_common_cache_info (long int *shared_ptr, long int * shared_per_thread_ptr, u
level. */
threads = ((cpu_features->features[CPUID_INDEX_1].cpuid.ebx >> 16)
& 0xff);
-
- /* Get per-thread size of highest level cache. */
- if (shared_per_thread > 0 && threads > 0)
- shared_per_thread /= threads;
}
+ /* Get per-thread size of highest level cache. */
+ if (shared_per_thread > 0 && threads > 0)
+ shared_per_thread /= threads;
}
/* Account for non-inclusive L2 and L3 caches. */
--
2.41.0

View File

@ -1,3 +1,24 @@
Index: glibc-2.38/debug/strcpy_chk.c
===================================================================
--- glibc-2.38.orig/debug/strcpy_chk.c
+++ glibc-2.38/debug/strcpy_chk.c
@@ -31,3 +31,4 @@ __strcpy_chk (char *dest, const char *sr
return memcpy (dest, src, len + 1);
}
+libc_hidden_builtin_def (__strcpy_chk)
Index: glibc-2.38/include/string.h
===================================================================
--- glibc-2.38.orig/include/string.h
+++ glibc-2.38/include/string.h
@@ -213,6 +213,7 @@ libc_hidden_builtin_proto (__memcpy_chk)
libc_hidden_builtin_proto (__memmove_chk)
libc_hidden_builtin_proto (__mempcpy_chk)
libc_hidden_builtin_proto (__memset_chk)
+libc_hidden_builtin_proto (__strcpy_chk)
libc_hidden_builtin_proto (__stpcpy_chk)
#endif
Index: glibc-2.38/intl/loadmsgcat.c
===================================================================
--- glibc-2.38.orig/intl/loadmsgcat.c

View File

@ -1,3 +1,29 @@
Mon Aug 28 11:56:10 UTC 2023 - Richard Biener <rguenther@suse.com>
- Add cross-ppc64le package
-------------------------------------------------------------------
Tue Aug 22 15:46:51 UTC 2023 - Andreas Schwab <schwab@suse.de>
- posix-memalign-fragmentation.patch: malloc: Enable merging of remainders
in memalign, remove bin scanning from memalign (BZ #30723)
- Limit build counter sync to i686 flavor, to reduce needs for rebuilds
-------------------------------------------------------------------
Tue Aug 22 11:24:16 UTC 2023 - Richard Biener <rguenther@suse.com>
- Add cross-s390x package (bsc#1214460)
-------------------------------------------------------------------
Mon Aug 14 08:12:28 UTC 2023 - Andreas Schwab <schwab@suse.de>
- Require that elf/check-localplt does not fail
- glibc-2.3.90-langpackdir.diff: add hidden alias for __strcpy_chk
- cache-amd-legacy.patch: x86: Fix for cache computation on AMD legacy
cpus
- cache-intel-shared.patch: x86: Fix incorrect scope of setting
`shared_per_thread` (BZ# 30745)
-------------------------------------------------------------------
Wed Aug 2 10:50:32 UTC 2023 - Andreas Schwab <schwab@suse.de>

View File

@ -30,15 +30,23 @@
# can use only simple RPM expressions, no lua, no shell, no '{expand:'
# expression :-/ Ideally we'd like to just strip the 'cross_' prefix,
# but we can't. So enumerate the possibilities for now.
%define cross_arch %{cross_cpu}
%if "%flavor" == "cross-aarch64"
%define cross_arch aarch64
%define cross_cpu aarch64
%endif
%if "%flavor" == "cross-riscv64"
%define cross_arch riscv64
%define cross_cpu riscv64
%endif
%if "%flavor" == "cross-s390x"
%define cross_cpu s390x
%endif
%if "%flavor" == "cross-ppc64le"
%define cross_arch ppc64le
%define cross_cpu powerpc64le
%endif
%if 0%{?cross_arch:1}
%define binutils_os %{cross_arch}-suse-linux
%if 0%{?cross_cpu:1}
%define binutils_os %{cross_cpu}-suse-linux
# use same sysroot as in binutils.spec
%define sysroot %{_prefix}/%{binutils_os}/sys-root
%endif
@ -81,7 +89,7 @@ ExclusiveArch: do_not_build
%define build_utils 0
%define build_testsuite 1
%endif
%if 0%{?cross_arch:1}
%if 0%{?cross_cpu:1}
%define build_main 0
%define build_utils 0
%define build_testsuite 0
@ -89,7 +97,7 @@ ExclusiveArch: do_not_build
%undefine _build_create_debug
ExcludeArch: %{cross_arch}
%endif
%define host_arch %{?cross_arch}%{!?cross_arch:%{_target_cpu}}
%define host_arch %{?cross_cpu}%{!?cross_cpu:%{_target_cpu}}
%if %{build_main}
%define name_suffix %{nil}
@ -232,6 +240,8 @@ BuildRequires: cross-%{cross_arch}-linux-glibc-devel
%if "%flavor" == "i686"
ExclusiveArch: i586 i686
BuildArch: i686
# Sync only this build counter with the main build
#!BcntSyncTag: glibc
%endif
###
@ -289,8 +299,14 @@ Patch306: glibc-fix-double-loopback.diff
###
# Patches from upstream
###
# PATCH-FIX-OPENSUSE iconv: restore verbosity with unrecognized encoding names (BZ #30694)
# PATCH-FIX-UPSTREAM iconv: restore verbosity with unrecognized encoding names (BZ #30694)
Patch1000: iconv-error-verbosity.patch
# PATCH-FIX-UPSTREAM x86: Fix for cache computation on AMD legacy cpus
Patch1001: cache-amd-legacy.patch
# PATCH-FIX-UPSTREAM x86: Fix incorrect scope of setting `shared_per_thread` (BZ# 30745)
Patch1002: cache-intel-shared.patch
# PATCH-FIX-UPSTREAM malloc: Enable merging of remainders in memalign, remove bin scanning from memalign (BZ #30723)
Patch1003: posix-memalign-fragmentation.patch
###
# Patches awaiting upstream approval
@ -515,6 +531,9 @@ library in a cross compilation setting.
%if %{without snapshot}
%patch1000 -p1
%patch1001 -p1
%patch1002 -p1
%patch1003 -p1
%endif
%patch2000 -p1
@ -590,8 +609,8 @@ BuildCCplus="%__cxx"
#now overwrite for some architectures
#
%if %{build_cross}
BuildCC=%{cross_arch}-suse-linux-gcc
BuildCCplus=%{cross_arch}-suse-linux-g++
BuildCC=%{cross_cpu}-suse-linux-gcc
BuildCCplus=%{cross_cpu}-suse-linux-g++
%else
%ifarch sparc64
BuildFlags="-O2 -mcpu=ultrasparc -mvis -fcall-used-g6"
@ -754,6 +773,7 @@ make %{?_smp_mflags} %{?make_output_sync} -C cc-base -k check || {
# Exceptions:
# None!
make %{?_smp_mflags} %{?make_output_sync} -C cc-base check-abi
make %{?_smp_mflags} %{?make_output_sync} -C cc-base test t=elf/check-localplt
%endif
%define rtldlib %{_lib}

View File

@ -0,0 +1,554 @@
From 542b1105852568c3ebc712225ae78b8c8ba31a78 Mon Sep 17 00:00:00 2001
From: Florian Weimer <fweimer@redhat.com>
Date: Fri, 11 Aug 2023 11:18:17 +0200
Subject: [PATCH] malloc: Enable merging of remainders in memalign (bug 30723)
Previously, calling _int_free from _int_memalign could put remainders
into the tcache or into fastbins, where they are invisible to the
low-level allocator. This results in missed merge opportunities
because once these freed chunks become available to the low-level
allocator, further memalign allocations (even of the same size are)
likely obstructing merges.
Furthermore, during forwards merging in _int_memalign, do not
completely give up when the remainder is too small to serve as a
chunk on its own. We can still give it back if it can be merged
with the following unused chunk. This makes it more likely that
memalign calls in a loop achieve a compact memory layout,
independently of initial heap layout.
Drop some useless (unsigned long) casts along the way, and tweak
the style to more closely match GNU on changed lines.
Reviewed-by: DJ Delorie <dj@redhat.com>
---
malloc/malloc.c | 197 +++++++++++++++++++++++++++++-------------------
1 file changed, 121 insertions(+), 76 deletions(-)
diff --git a/malloc/malloc.c b/malloc/malloc.c
index e2f1a615a4..948f9759af 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -1086,6 +1086,11 @@ typedef struct malloc_chunk* mchunkptr;
static void* _int_malloc(mstate, size_t);
static void _int_free(mstate, mchunkptr, int);
+static void _int_free_merge_chunk (mstate, mchunkptr, INTERNAL_SIZE_T);
+static INTERNAL_SIZE_T _int_free_create_chunk (mstate,
+ mchunkptr, INTERNAL_SIZE_T,
+ mchunkptr, INTERNAL_SIZE_T);
+static void _int_free_maybe_consolidate (mstate, INTERNAL_SIZE_T);
static void* _int_realloc(mstate, mchunkptr, INTERNAL_SIZE_T,
INTERNAL_SIZE_T);
static void* _int_memalign(mstate, size_t, size_t);
@@ -4637,31 +4642,52 @@ _int_free (mstate av, mchunkptr p, int have_lock)
if (!have_lock)
__libc_lock_lock (av->mutex);
- nextchunk = chunk_at_offset(p, size);
-
- /* Lightweight tests: check whether the block is already the
- top block. */
- if (__glibc_unlikely (p == av->top))
- malloc_printerr ("double free or corruption (top)");
- /* Or whether the next chunk is beyond the boundaries of the arena. */
- if (__builtin_expect (contiguous (av)
- && (char *) nextchunk
- >= ((char *) av->top + chunksize(av->top)), 0))
- malloc_printerr ("double free or corruption (out)");
- /* Or whether the block is actually not marked used. */
- if (__glibc_unlikely (!prev_inuse(nextchunk)))
- malloc_printerr ("double free or corruption (!prev)");
-
- nextsize = chunksize(nextchunk);
- if (__builtin_expect (chunksize_nomask (nextchunk) <= CHUNK_HDR_SZ, 0)
- || __builtin_expect (nextsize >= av->system_mem, 0))
- malloc_printerr ("free(): invalid next size (normal)");
+ _int_free_merge_chunk (av, p, size);
- free_perturb (chunk2mem(p), size - CHUNK_HDR_SZ);
+ if (!have_lock)
+ __libc_lock_unlock (av->mutex);
+ }
+ /*
+ If the chunk was allocated via mmap, release via munmap().
+ */
+
+ else {
+ munmap_chunk (p);
+ }
+}
+
+/* Try to merge chunk P of SIZE bytes with its neighbors. Put the
+ resulting chunk on the appropriate bin list. P must not be on a
+ bin list yet, and it can be in use. */
+static void
+_int_free_merge_chunk (mstate av, mchunkptr p, INTERNAL_SIZE_T size)
+{
+ mchunkptr nextchunk = chunk_at_offset(p, size);
+
+ /* Lightweight tests: check whether the block is already the
+ top block. */
+ if (__glibc_unlikely (p == av->top))
+ malloc_printerr ("double free or corruption (top)");
+ /* Or whether the next chunk is beyond the boundaries of the arena. */
+ if (__builtin_expect (contiguous (av)
+ && (char *) nextchunk
+ >= ((char *) av->top + chunksize(av->top)), 0))
+ malloc_printerr ("double free or corruption (out)");
+ /* Or whether the block is actually not marked used. */
+ if (__glibc_unlikely (!prev_inuse(nextchunk)))
+ malloc_printerr ("double free or corruption (!prev)");
+
+ INTERNAL_SIZE_T nextsize = chunksize(nextchunk);
+ if (__builtin_expect (chunksize_nomask (nextchunk) <= CHUNK_HDR_SZ, 0)
+ || __builtin_expect (nextsize >= av->system_mem, 0))
+ malloc_printerr ("free(): invalid next size (normal)");
+
+ free_perturb (chunk2mem(p), size - CHUNK_HDR_SZ);
- /* consolidate backward */
- if (!prev_inuse(p)) {
- prevsize = prev_size (p);
+ /* Consolidate backward. */
+ if (!prev_inuse(p))
+ {
+ INTERNAL_SIZE_T prevsize = prev_size (p);
size += prevsize;
p = chunk_at_offset(p, -((long) prevsize));
if (__glibc_unlikely (chunksize(p) != prevsize))
@@ -4669,9 +4695,25 @@ _int_free (mstate av, mchunkptr p, int have_lock)
unlink_chunk (av, p);
}
- if (nextchunk != av->top) {
+ /* Write the chunk header, maybe after merging with the following chunk. */
+ size = _int_free_create_chunk (av, p, size, nextchunk, nextsize);
+ _int_free_maybe_consolidate (av, size);
+}
+
+/* Create a chunk at P of SIZE bytes, with SIZE potentially increased
+ to cover the immediately following chunk NEXTCHUNK of NEXTSIZE
+ bytes (if NEXTCHUNK is unused). The chunk at P is not actually
+ read and does not have to be initialized. After creation, it is
+ placed on the appropriate bin list. The function returns the size
+ of the new chunk. */
+static INTERNAL_SIZE_T
+_int_free_create_chunk (mstate av, mchunkptr p, INTERNAL_SIZE_T size,
+ mchunkptr nextchunk, INTERNAL_SIZE_T nextsize)
+{
+ if (nextchunk != av->top)
+ {
/* get and clear inuse bit */
- nextinuse = inuse_bit_at_offset(nextchunk, nextsize);
+ bool nextinuse = inuse_bit_at_offset (nextchunk, nextsize);
/* consolidate forward */
if (!nextinuse) {
@@ -4686,8 +4728,8 @@ _int_free (mstate av, mchunkptr p, int have_lock)
been given one chance to be used in malloc.
*/
- bck = unsorted_chunks(av);
- fwd = bck->fd;
+ mchunkptr bck = unsorted_chunks (av);
+ mchunkptr fwd = bck->fd;
if (__glibc_unlikely (fwd->bk != bck))
malloc_printerr ("free(): corrupted unsorted chunks");
p->fd = fwd;
@@ -4706,61 +4748,52 @@ _int_free (mstate av, mchunkptr p, int have_lock)
check_free_chunk(av, p);
}
- /*
- If the chunk borders the current high end of memory,
- consolidate into top
- */
-
- else {
+ else
+ {
+ /* If the chunk borders the current high end of memory,
+ consolidate into top. */
size += nextsize;
set_head(p, size | PREV_INUSE);
av->top = p;
check_chunk(av, p);
}
- /*
- If freeing a large space, consolidate possibly-surrounding
- chunks. Then, if the total unused topmost memory exceeds trim
- threshold, ask malloc_trim to reduce top.
-
- Unless max_fast is 0, we don't know if there are fastbins
- bordering top, so we cannot tell for sure whether threshold
- has been reached unless fastbins are consolidated. But we
- don't want to consolidate on each free. As a compromise,
- consolidation is performed if FASTBIN_CONSOLIDATION_THRESHOLD
- is reached.
- */
+ return size;
+}
- if ((unsigned long)(size) >= FASTBIN_CONSOLIDATION_THRESHOLD) {
+/* If freeing a large space, consolidate possibly-surrounding
+ chunks. Then, if the total unused topmost memory exceeds trim
+ threshold, ask malloc_trim to reduce top. */
+static void
+_int_free_maybe_consolidate (mstate av, INTERNAL_SIZE_T size)
+{
+ /* Unless max_fast is 0, we don't know if there are fastbins
+ bordering top, so we cannot tell for sure whether threshold has
+ been reached unless fastbins are consolidated. But we don't want
+ to consolidate on each free. As a compromise, consolidation is
+ performed if FASTBIN_CONSOLIDATION_THRESHOLD is reached. */
+ if (size >= FASTBIN_CONSOLIDATION_THRESHOLD)
+ {
if (atomic_load_relaxed (&av->have_fastchunks))
malloc_consolidate(av);
- if (av == &main_arena) {
+ if (av == &main_arena)
+ {
#ifndef MORECORE_CANNOT_TRIM
- if ((unsigned long)(chunksize(av->top)) >=
- (unsigned long)(mp_.trim_threshold))
- systrim(mp_.top_pad, av);
+ if (chunksize (av->top) >= mp_.trim_threshold)
+ systrim (mp_.top_pad, av);
#endif
- } else {
- /* Always try heap_trim(), even if the top chunk is not
- large, because the corresponding heap might go away. */
- heap_info *heap = heap_for_ptr(top(av));
+ }
+ else
+ {
+ /* Always try heap_trim, even if the top chunk is not large,
+ because the corresponding heap might go away. */
+ heap_info *heap = heap_for_ptr (top (av));
- assert(heap->ar_ptr == av);
- heap_trim(heap, mp_.top_pad);
- }
+ assert (heap->ar_ptr == av);
+ heap_trim (heap, mp_.top_pad);
+ }
}
-
- if (!have_lock)
- __libc_lock_unlock (av->mutex);
- }
- /*
- If the chunk was allocated via mmap, release via munmap().
- */
-
- else {
- munmap_chunk (p);
- }
}
/*
@@ -5221,7 +5254,7 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
(av != &main_arena ? NON_MAIN_ARENA : 0));
set_inuse_bit_at_offset (newp, newsize);
set_head_size (p, leadsize | (av != &main_arena ? NON_MAIN_ARENA : 0));
- _int_free (av, p, 1);
+ _int_free_merge_chunk (av, p, leadsize);
p = newp;
assert (newsize >= nb &&
@@ -5232,15 +5265,27 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
if (!chunk_is_mmapped (p))
{
size = chunksize (p);
- if ((unsigned long) (size) > (unsigned long) (nb + MINSIZE))
+ mchunkptr nextchunk = chunk_at_offset(p, size);
+ INTERNAL_SIZE_T nextsize = chunksize(nextchunk);
+ if (size > nb)
{
remainder_size = size - nb;
- remainder = chunk_at_offset (p, nb);
- set_head (remainder, remainder_size | PREV_INUSE |
- (av != &main_arena ? NON_MAIN_ARENA : 0));
- set_head_size (p, nb);
- _int_free (av, remainder, 1);
- }
+ if (remainder_size >= MINSIZE
+ || nextchunk == av->top
+ || !inuse_bit_at_offset (nextchunk, nextsize))
+ {
+ /* We can only give back the tail if it is larger than
+ MINSIZE, or if the following chunk is unused (top
+ chunk or unused in-heap chunk). Otherwise we would
+ create a chunk that is smaller than MINSIZE. */
+ remainder = chunk_at_offset (p, nb);
+ set_head_size (p, nb);
+ remainder_size = _int_free_create_chunk (av, remainder,
+ remainder_size,
+ nextchunk, nextsize);
+ _int_free_maybe_consolidate (av, remainder_size);
+ }
+ }
}
check_inuse_chunk (av, p);
--
2.41.0
From 0dc7fc1cf094406a138e4d1bcf9553e59edcf89d Mon Sep 17 00:00:00 2001
From: Florian Weimer <fweimer@redhat.com>
Date: Thu, 10 Aug 2023 19:36:56 +0200
Subject: [PATCH] malloc: Remove bin scanning from memalign (bug 30723)
On the test workload (mpv --cache=yes with VP9 video decoding), the
bin scanning has a very poor success rate (less than 2%). The tcache
scanning has about 50% success rate, so keep that.
Update comments in malloc/tst-memalign-2 to indicate the purpose
of the tests. Even with the scanning removed, the additional
merging opportunities since commit 542b1105852568c3ebc712225ae78b
("malloc: Enable merging of remainders in memalign (bug 30723)")
are sufficient to pass the existing large bins test.
Remove leftover variables from _int_free from refactoring in the
same commit.
Reviewed-by: DJ Delorie <dj@redhat.com>
---
malloc/malloc.c | 169 ++--------------------------------------
malloc/tst-memalign-2.c | 7 +-
2 files changed, 10 insertions(+), 166 deletions(-)
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 948f9759af..d0bbbf3710 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -4488,12 +4488,6 @@ _int_free (mstate av, mchunkptr p, int have_lock)
{
INTERNAL_SIZE_T size; /* its size */
mfastbinptr *fb; /* associated fastbin */
- mchunkptr nextchunk; /* next contiguous chunk */
- INTERNAL_SIZE_T nextsize; /* its size */
- int nextinuse; /* true if nextchunk is used */
- INTERNAL_SIZE_T prevsize; /* size of previous contiguous chunk */
- mchunkptr bck; /* misc temp for linking */
- mchunkptr fwd; /* misc temp for linking */
size = chunksize (p);
@@ -5032,42 +5026,6 @@ _int_realloc (mstate av, mchunkptr oldp, INTERNAL_SIZE_T oldsize,
------------------------------ memalign ------------------------------
*/
-/* Returns 0 if the chunk is not and does not contain the requested
- aligned sub-chunk, else returns the amount of "waste" from
- trimming. NB is the *chunk* byte size, not the user byte
- size. */
-static size_t
-chunk_ok_for_memalign (mchunkptr p, size_t alignment, size_t nb)
-{
- void *m = chunk2mem (p);
- INTERNAL_SIZE_T size = chunksize (p);
- void *aligned_m = m;
-
- if (__glibc_unlikely (misaligned_chunk (p)))
- malloc_printerr ("_int_memalign(): unaligned chunk detected");
-
- aligned_m = PTR_ALIGN_UP (m, alignment);
-
- INTERNAL_SIZE_T front_extra = (intptr_t) aligned_m - (intptr_t) m;
-
- /* We can't trim off the front as it's too small. */
- if (front_extra > 0 && front_extra < MINSIZE)
- return 0;
-
- /* If it's a perfect fit, it's an exception to the return value rule
- (we would return zero waste, which looks like "not usable"), so
- handle it here by returning a small non-zero value instead. */
- if (size == nb && front_extra == 0)
- return 1;
-
- /* If the block we need fits in the chunk, calculate total waste. */
- if (size > nb + front_extra)
- return size - nb;
-
- /* Can't use this chunk. */
- return 0;
-}
-
/* BYTES is user requested bytes, not requested chunksize bytes. */
static void *
_int_memalign (mstate av, size_t alignment, size_t bytes)
@@ -5082,7 +5040,6 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
mchunkptr remainder; /* spare room at end to split off */
unsigned long remainder_size; /* its size */
INTERNAL_SIZE_T size;
- mchunkptr victim;
nb = checked_request2size (bytes);
if (nb == 0)
@@ -5101,129 +5058,13 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
we don't find anything in those bins, the common malloc code will
scan starting at 2x. */
- /* This will be set if we found a candidate chunk. */
- victim = NULL;
-
- /* Fast bins are singly-linked, hard to remove a chunk from the middle
- and unlikely to meet our alignment requirements. We have not done
- any experimentation with searching for aligned fastbins. */
-
- if (av != NULL)
- {
- int first_bin_index;
- int first_largebin_index;
- int last_bin_index;
-
- if (in_smallbin_range (nb))
- first_bin_index = smallbin_index (nb);
- else
- first_bin_index = largebin_index (nb);
-
- if (in_smallbin_range (nb * 2))
- last_bin_index = smallbin_index (nb * 2);
- else
- last_bin_index = largebin_index (nb * 2);
-
- first_largebin_index = largebin_index (MIN_LARGE_SIZE);
-
- int victim_index; /* its bin index */
-
- for (victim_index = first_bin_index;
- victim_index < last_bin_index;
- victim_index ++)
- {
- victim = NULL;
-
- if (victim_index < first_largebin_index)
- {
- /* Check small bins. Small bin chunks are doubly-linked despite
- being the same size. */
-
- mchunkptr fwd; /* misc temp for linking */
- mchunkptr bck; /* misc temp for linking */
-
- bck = bin_at (av, victim_index);
- fwd = bck->fd;
- while (fwd != bck)
- {
- if (chunk_ok_for_memalign (fwd, alignment, nb) > 0)
- {
- victim = fwd;
-
- /* Unlink it */
- victim->fd->bk = victim->bk;
- victim->bk->fd = victim->fd;
- break;
- }
-
- fwd = fwd->fd;
- }
- }
- else
- {
- /* Check large bins. */
- mchunkptr fwd; /* misc temp for linking */
- mchunkptr bck; /* misc temp for linking */
- mchunkptr best = NULL;
- size_t best_size = 0;
-
- bck = bin_at (av, victim_index);
- fwd = bck->fd;
+ /* Call malloc with worst case padding to hit alignment. */
+ m = (char *) (_int_malloc (av, nb + alignment + MINSIZE));
- while (fwd != bck)
- {
- int extra;
-
- if (chunksize (fwd) < nb)
- break;
- extra = chunk_ok_for_memalign (fwd, alignment, nb);
- if (extra > 0
- && (extra <= best_size || best == NULL))
- {
- best = fwd;
- best_size = extra;
- }
+ if (m == 0)
+ return 0; /* propagate failure */
- fwd = fwd->fd;
- }
- victim = best;
-
- if (victim != NULL)
- {
- unlink_chunk (av, victim);
- break;
- }
- }
-
- if (victim != NULL)
- break;
- }
- }
-
- /* Strategy: find a spot within that chunk that meets the alignment
- request, and then possibly free the leading and trailing space.
- This strategy is incredibly costly and can lead to external
- fragmentation if header and footer chunks are unused. */
-
- if (victim != NULL)
- {
- p = victim;
- m = chunk2mem (p);
- set_inuse (p);
- if (av != &main_arena)
- set_non_main_arena (p);
- }
- else
- {
- /* Call malloc with worst case padding to hit alignment. */
-
- m = (char *) (_int_malloc (av, nb + alignment + MINSIZE));
-
- if (m == 0)
- return 0; /* propagate failure */
-
- p = mem2chunk (m);
- }
+ p = mem2chunk (m);
if ((((unsigned long) (m)) % alignment) != 0) /* misaligned */
{
diff --git a/malloc/tst-memalign-2.c b/malloc/tst-memalign-2.c
index f229283dbf..ecd6fa249e 100644
--- a/malloc/tst-memalign-2.c
+++ b/malloc/tst-memalign-2.c
@@ -86,7 +86,8 @@ do_test (void)
TEST_VERIFY (tcache_allocs[i].ptr1 == tcache_allocs[i].ptr2);
}
- /* Test for non-head tcache hits. */
+ /* Test for non-head tcache hits. This exercises the memalign
+ scanning code to find matching allocations. */
for (i = 0; i < array_length (ptr); ++ i)
{
if (i == 4)
@@ -113,7 +114,9 @@ do_test (void)
free (p);
TEST_VERIFY (count > 0);
- /* Large bins test. */
+ /* Large bins test. This verifies that the over-allocated parts
+ that memalign releases for future allocations can be reused by
+ memalign itself at least in some cases. */
for (i = 0; i < LN; ++ i)
{
--
2.41.0