Accepting request 1107915 from Base:System

- Add cross-ppc64le package (forwarded request 1107913 from Andreas_Schwab) OBS-URL: https://build.opensuse.org/request/show/1107915 OBS-URL: https://build.opensuse.org/package/show/openSUSE:Factory/glibc?expand=0&rev=278
2023-08-30 08:19:27 +00:00
parent 09998423be 846369ca9b
commit 3481e3ba51
7 changed files with 963 additions and 9 deletions
--- a/2
+++ b/2
@@ -3,5 +3,7 @@
  <package>utils</package>
  <package>testsuite</package>
  <package>cross-aarch64</package>
+  <package>cross-ppc64le</package>
  <package>cross-riscv64</package>
+  <package>cross-s390x</package>
 </multibuild>
--- a/cache-amd-legacy.patch
+++ b/cache-amd-legacy.patch
@@ -0,0 +1,286 @@
+From ced101ed9d3b7cfd12d97ef24940cb00b8658c81 Mon Sep 17 00:00:00 2001
+From: Sajan Karumanchi <sajan.karumanchi@amd.com>
+Date: Tue, 1 Aug 2023 15:20:55 +0000
+Subject: [PATCH] x86: Fix for cache computation on AMD legacy cpus.
+
+Some legacy AMD CPUs and hypervisors have the _cpuid_ '0x8000_001D'
+set to Zero, thus resulting in zeroed-out computed cache values.
+This patch reintroduces the old way of cache computation as a
+fail-safe option to handle these exceptions.
+Fixed 'level4_cache_size' value through handle_amd().
+
+Reviewed-by: Premachandra Mallappa <premachandra.mallappa@amd.com>
+Tested-by: Florian Weimer <fweimer@redhat.com>
+---
+ sysdeps/x86/dl-cacheinfo.h | 226 ++++++++++++++++++++++++++++++++-----
+ 1 file changed, 199 insertions(+), 27 deletions(-)
+
+diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
+index cd4d0351ae..285773039f 100644
+--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
+@@ -315,40 +315,206 @@ handle_amd (int name)
+ {
+   unsigned int eax;
+   unsigned int ebx;
+-  unsigned int ecx;
+  unsigned int ecx = 0;
+   unsigned int edx;
+-  unsigned int count = 0x1;
+  unsigned int max_cpuid = 0;
+  unsigned int fn = 0;
+ 
+   /* No level 4 cache (yet).  */
+   if (name > _SC_LEVEL3_CACHE_LINESIZE)
+     return 0;
+ 
+-  if (name >= _SC_LEVEL3_CACHE_SIZE)
+-    count = 0x3;
+-  else if (name >= _SC_LEVEL2_CACHE_SIZE)
+-    count = 0x2;
+-  else if (name >= _SC_LEVEL1_DCACHE_SIZE)
+-    count = 0x0;
+  __cpuid (0x80000000, max_cpuid, ebx, ecx, edx);
+
+  if (max_cpuid >= 0x8000001D)
+    /* Use __cpuid__ '0x8000_001D' to compute cache details.  */
+    {
+      unsigned int count = 0x1;
+
+      if (name >= _SC_LEVEL3_CACHE_SIZE)
+        count = 0x3;
+      else if (name >= _SC_LEVEL2_CACHE_SIZE)
+        count = 0x2;
+      else if (name >= _SC_LEVEL1_DCACHE_SIZE)
+        count = 0x0;
+
+      __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx);
+
+      if (ecx != 0)
+        {
+          switch (name)
+            {
+            case _SC_LEVEL1_ICACHE_ASSOC:
+            case _SC_LEVEL1_DCACHE_ASSOC:
+            case _SC_LEVEL2_CACHE_ASSOC:
+            case _SC_LEVEL3_CACHE_ASSOC:
+              return ((ebx >> 22) & 0x3ff) + 1;
+            case _SC_LEVEL1_ICACHE_LINESIZE:
+            case _SC_LEVEL1_DCACHE_LINESIZE:
+            case _SC_LEVEL2_CACHE_LINESIZE:
+            case _SC_LEVEL3_CACHE_LINESIZE:
+              return (ebx & 0xfff) + 1;
+            case _SC_LEVEL1_ICACHE_SIZE:
+            case _SC_LEVEL1_DCACHE_SIZE:
+            case _SC_LEVEL2_CACHE_SIZE:
+            case _SC_LEVEL3_CACHE_SIZE:
+              return (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1);
+            default:
+              __builtin_unreachable ();
+            }
+          return -1;
+        }
+    }
+
+  /* Legacy cache computation for CPUs prior to Bulldozer family.
+     This is also a fail-safe mechanism for some hypervisors that
+     accidentally configure __cpuid__ '0x8000_001D' to Zero.  */
+ 
+-  __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx);
+  fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE);
+
+  if (max_cpuid < fn)
+    return 0;
+
+  __cpuid (fn, eax, ebx, ecx, edx);
+
+  if (name < _SC_LEVEL1_DCACHE_SIZE)
+    {
+      name += _SC_LEVEL1_DCACHE_SIZE - _SC_LEVEL1_ICACHE_SIZE;
+      ecx = edx;
+    }
+ 
+   switch (name)
+     {
+-    case _SC_LEVEL1_ICACHE_ASSOC:
+-    case _SC_LEVEL1_DCACHE_ASSOC:
+-    case _SC_LEVEL2_CACHE_ASSOC:
+      case _SC_LEVEL1_DCACHE_SIZE:
+        return (ecx >> 14) & 0x3fc00;
+
+      case _SC_LEVEL1_DCACHE_ASSOC:
+        ecx >>= 16;
+        if ((ecx & 0xff) == 0xff)
+        {
+          /* Fully associative.  */
+          return (ecx << 2) & 0x3fc00;
+        }
+        return ecx & 0xff;
+
+      case _SC_LEVEL1_DCACHE_LINESIZE:
+        return ecx & 0xff;
+
+      case _SC_LEVEL2_CACHE_SIZE:
+        return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00;
+
+      case _SC_LEVEL2_CACHE_ASSOC:
+        switch ((ecx >> 12) & 0xf)
+          {
+            case 0:
+            case 1:
+            case 2:
+            case 4:
+              return (ecx >> 12) & 0xf;
+            case 6:
+              return 8;
+            case 8:
+              return 16;
+            case 10:
+              return 32;
+            case 11:
+              return 48;
+            case 12:
+              return 64;
+            case 13:
+              return 96;
+            case 14:
+              return 128;
+            case 15:
+              return ((ecx >> 6) & 0x3fffc00) / (ecx & 0xff);
+            default:
+              return 0;
+          }
+
+      case _SC_LEVEL2_CACHE_LINESIZE:
+        return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff;
+
+      case _SC_LEVEL3_CACHE_SIZE:
+        {
+        long int total_l3_cache = 0, l3_cache_per_thread = 0;
+        unsigned int threads = 0;
+        const struct cpu_features *cpu_features;
+
+        if ((edx & 0xf000) == 0)
+          return 0;
+
+        total_l3_cache = (edx & 0x3ffc0000) << 1;
+        cpu_features = __get_cpu_features ();
+
+        /* Figure out the number of logical threads that share L3.  */
+        if (max_cpuid >= 0x80000008)
+          {
+            /* Get width of APIC ID.  */
+            __cpuid (0x80000008, eax, ebx, ecx, edx);
+            threads = (ecx & 0xff) + 1;
+          }
+
+        if (threads == 0)
+          {
+            /* If APIC ID width is not available, use logical
+            processor count.  */
+            __cpuid (0x00000001, eax, ebx, ecx, edx);
+            if ((edx & (1 << 28)) != 0)
+              threads = (ebx >> 16) & 0xff;
+          }
+
+        /* Cap usage of highest cache level to the number of
+           supported threads.  */
+        if (threads > 0)
+          l3_cache_per_thread = total_l3_cache/threads;
+
+        /* Get shared cache per ccx for Zen architectures.  */
+        if (cpu_features->basic.family >= 0x17)
+          {
+            long int l3_cache_per_ccx = 0;
+            /* Get number of threads share the L3 cache in CCX.  */
+            __cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx);
+            unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1;
+            l3_cache_per_ccx = l3_cache_per_thread * threads_per_ccx;
+            return l3_cache_per_ccx;
+          }
+        else
+          {
+            return l3_cache_per_thread;
+          }
+      }
+
+     case _SC_LEVEL3_CACHE_ASSOC:
+-      return ecx ? ((ebx >> 22) & 0x3ff) + 1 : 0;
+-    case _SC_LEVEL1_ICACHE_LINESIZE:
+-    case _SC_LEVEL1_DCACHE_LINESIZE:
+-    case _SC_LEVEL2_CACHE_LINESIZE:
+      switch ((edx >> 12) & 0xf)
+      {
+        case 0:
+        case 1:
+        case 2:
+        case 4:
+          return (edx >> 12) & 0xf;
+        case 6:
+          return 8;
+        case 8:
+          return 16;
+        case 10:
+          return 32;
+        case 11:
+          return 48;
+        case 12:
+          return 64;
+        case 13:
+          return 96;
+        case 14:
+          return 128;
+        case 15:
+          return ((edx & 0x3ffc0000) << 1) / (edx & 0xff);
+        default:
+          return 0;
+      }
+
+     case _SC_LEVEL3_CACHE_LINESIZE:
+-      return ecx ? (ebx & 0xfff) + 1 : 0;
+-    case _SC_LEVEL1_ICACHE_SIZE:
+-    case _SC_LEVEL1_DCACHE_SIZE:
+-    case _SC_LEVEL2_CACHE_SIZE:
+-    case _SC_LEVEL3_CACHE_SIZE:
+-      return ecx ? (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1): 0;
+      return (edx & 0xf000) == 0 ? 0 : edx & 0xff;
+
+     default:
+       __builtin_unreachable ();
+     }
+@@ -703,7 +869,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+       data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
+       core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
+       shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
+-      shared_per_thread = shared;
+ 
+       level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
+       level1_icache_linesize = handle_amd (_SC_LEVEL1_ICACHE_LINESIZE);
+@@ -716,13 +881,20 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+       level3_cache_size = shared;
+       level3_cache_assoc = handle_amd (_SC_LEVEL3_CACHE_ASSOC);
+       level3_cache_linesize = handle_amd (_SC_LEVEL3_CACHE_LINESIZE);
+      level4_cache_size = handle_amd (_SC_LEVEL4_CACHE_SIZE);
+ 
+       if (shared <= 0)
+-        /* No shared L3 cache.  All we have is the L2 cache.  */
+-	shared = core;
+        {
+           /* No shared L3 cache.  All we have is the L2 cache.  */
+           shared = core;
+        }
+      else if (cpu_features->basic.family < 0x17)
+        {
+           /* Account for exclusive L2 and L3 caches.  */
+           shared += core;
+        }
+ 
+-      if (shared_per_thread <= 0)
+-	shared_per_thread = shared;
+      shared_per_thread = shared;
+     }
+ 
+   cpu_features->level1_icache_size = level1_icache_size;
+-- 
+2.41.0
+
--- a/cache-intel-shared.patch
+++ b/cache-intel-shared.patch
@@ -0,0 +1,45 @@
+From 5ea70cc02626d9b85f1570153873d8648a47bf95 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Thu, 10 Aug 2023 19:28:24 -0500
+Subject: [PATCH] x86: Fix incorrect scope of setting `shared_per_thread` [BZ#
+ 30745]
+
+The code:
+
+```
+    if (shared_per_thread > 0 && threads > 0)
+      shared_per_thread /= threads;
+```
+
+was accidentally moved inside the else scope.  This doesn't
+match how it was previously (before af992e7abd).
+
+This patch fixes that by putting the division after the `else` block.
+
+(cherry picked from commit 084fb31bc2c5f95ae0b9e6df4d3cf0ff43471ede)
+---
+ sysdeps/x86/dl-cacheinfo.h | 7 +++----
+ 1 file changed, 3 insertions(+), 4 deletions(-)
+
+diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
+index 285773039f..5ddb35c9d9 100644
+--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
+@@ -770,11 +770,10 @@ get_common_cache_info (long int *shared_ptr, long int * shared_per_thread_ptr, u
+ 	     level.  */
+ 	  threads = ((cpu_features->features[CPUID_INDEX_1].cpuid.ebx >> 16)
+ 		     & 0xff);
+-
+-	  /* Get per-thread size of highest level cache.  */
+-	  if (shared_per_thread > 0 && threads > 0)
+-	    shared_per_thread /= threads;
+ 	}
+      /* Get per-thread size of highest level cache.  */
+      if (shared_per_thread > 0 && threads > 0)
+	shared_per_thread /= threads;
+     }
+ 
+   /* Account for non-inclusive L2 and L3 caches.  */
+-- 
+2.41.0
+
--- a/glibc-2.3.90-langpackdir.diff
+++ b/glibc-2.3.90-langpackdir.diff
@@ -1,3 +1,24 @@
+Index: glibc-2.38/debug/strcpy_chk.c
+===================================================================
+--- glibc-2.38.orig/debug/strcpy_chk.c
+++ glibc-2.38/debug/strcpy_chk.c
+@@ -31,3 +31,4 @@ __strcpy_chk (char *dest, const char *sr
+ 
+   return memcpy (dest, src, len + 1);
+ }
+libc_hidden_builtin_def (__strcpy_chk)
+Index: glibc-2.38/include/string.h
+===================================================================
+--- glibc-2.38.orig/include/string.h
+++ glibc-2.38/include/string.h
+@@ -213,6 +213,7 @@ libc_hidden_builtin_proto (__memcpy_chk)
+ libc_hidden_builtin_proto (__memmove_chk)
+ libc_hidden_builtin_proto (__mempcpy_chk)
+ libc_hidden_builtin_proto (__memset_chk)
+libc_hidden_builtin_proto (__strcpy_chk)
+ libc_hidden_builtin_proto (__stpcpy_chk)
+ 
+ #endif
 Index: glibc-2.38/intl/loadmsgcat.c
 ===================================================================
 --- glibc-2.38.orig/intl/loadmsgcat.c
--- a/glibc.changes
+++ b/glibc.changes
@@ -1,3 +1,29 @@
+Mon Aug 28 11:56:10 UTC 2023 - Richard Biener <rguenther@suse.com>
+
+- Add cross-ppc64le package
+
+-------------------------------------------------------------------
+Tue Aug 22 15:46:51 UTC 2023 - Andreas Schwab <schwab@suse.de>
+
+- posix-memalign-fragmentation.patch: malloc: Enable merging of remainders
+  in memalign, remove bin scanning from memalign (BZ #30723)
+- Limit build counter sync to i686 flavor, to reduce the need for rebuilds
+
+-------------------------------------------------------------------
+Tue Aug 22 11:24:16 UTC 2023 - Richard Biener <rguenther@suse.com>
+
+- Add cross-s390x package (bsc#1214460)
+
+-------------------------------------------------------------------
+Mon Aug 14 08:12:28 UTC 2023 - Andreas Schwab <schwab@suse.de>
+
+- Require that elf/check-localplt does not fail
+- glibc-2.3.90-langpackdir.diff: add hidden alias for __strcpy_chk
+- cache-amd-legacy.patch: x86: Fix for cache computation on AMD legacy
+  cpus
+- cache-intel-shared.patch: x86: Fix incorrect scope of setting
+  `shared_per_thread` (BZ# 30745)
+
 -------------------------------------------------------------------
 Wed Aug  2 10:50:32 UTC 2023 - Andreas Schwab <schwab@suse.de>

--- a/glibc.spec
+++ b/glibc.spec
@@ -30,15 +30,23 @@
 # can use only simple RPM expressions, no lua, no shell, no '{expand:'
 # expression :-/  Ideally we'd like to just strip the 'cross_' prefix,
 # but we can't.  So enumerate the possibilities for now.
+%define cross_arch %{cross_cpu}
 %if "%flavor" == "cross-aarch64"
-%define cross_arch aarch64
+%define cross_cpu aarch64
 %endif
 %if "%flavor" == "cross-riscv64"
-%define cross_arch riscv64
+%define cross_cpu riscv64
+%endif
+%if "%flavor" == "cross-s390x"
+%define cross_cpu s390x
+%endif
+%if "%flavor" == "cross-ppc64le"
+%define cross_arch ppc64le
+%define cross_cpu powerpc64le
 %endif

-%if 0%{?cross_arch:1}
-%define binutils_os %{cross_arch}-suse-linux
+%if 0%{?cross_cpu:1}
+%define binutils_os %{cross_cpu}-suse-linux
 # use same sysroot as in binutils.spec
 %define sysroot %{_prefix}/%{binutils_os}/sys-root
 %endif
@@ -81,7 +89,7 @@ ExclusiveArch:  do_not_build
 %define build_utils 0
 %define build_testsuite 1
 %endif
-%if 0%{?cross_arch:1}
+%if 0%{?cross_cpu:1}
 %define build_main 0
 %define build_utils 0
 %define build_testsuite 0
@@ -89,7 +97,7 @@ ExclusiveArch:  do_not_build
 %undefine _build_create_debug
 ExcludeArch:    %{cross_arch}
 %endif
-%define host_arch %{?cross_arch}%{!?cross_arch:%{_target_cpu}}
+%define host_arch %{?cross_cpu}%{!?cross_cpu:%{_target_cpu}}

 %if %{build_main}
 %define name_suffix %{nil}
@@ -232,6 +240,8 @@ BuildRequires:  cross-%{cross_arch}-linux-glibc-devel
 %if "%flavor" == "i686"
 ExclusiveArch:  i586 i686
 BuildArch:      i686
+# Sync only this build counter with the main build
+#!BcntSyncTag:  glibc
 %endif

 ###
@@ -289,8 +299,14 @@ Patch306:       glibc-fix-double-loopback.diff
 ###
 # Patches from upstream
 ###
-# PATCH-FIX-OPENSUSE iconv: restore verbosity with unrecognized encoding names (BZ #30694)
+# PATCH-FIX-UPSTREAM iconv: restore verbosity with unrecognized encoding names (BZ #30694)
 Patch1000:      iconv-error-verbosity.patch
+# PATCH-FIX-UPSTREAM x86: Fix for cache computation on AMD legacy cpus
+Patch1001:      cache-amd-legacy.patch
+# PATCH-FIX-UPSTREAM x86: Fix incorrect scope of setting `shared_per_thread` (BZ# 30745)
+Patch1002:      cache-intel-shared.patch
+# PATCH-FIX-UPSTREAM malloc: Enable merging of remainders in memalign, remove bin scanning from memalign (BZ #30723)
+Patch1003:      posix-memalign-fragmentation.patch

 ###
 # Patches awaiting upstream approval
@@ -515,6 +531,9 @@ library in a cross compilation setting.

 %if %{without snapshot}
 %patch1000 -p1
+%patch1001 -p1
+%patch1002 -p1
+%patch1003 -p1
 %endif

 %patch2000 -p1
@@ -590,8 +609,8 @@ BuildCCplus="%__cxx"
 #now overwrite for some architectures
 #
 %if %{build_cross}
-BuildCC=%{cross_arch}-suse-linux-gcc
-BuildCCplus=%{cross_arch}-suse-linux-g++
+BuildCC=%{cross_cpu}-suse-linux-gcc
+BuildCCplus=%{cross_cpu}-suse-linux-g++
 %else
 %ifarch sparc64
 	BuildFlags="-O2 -mcpu=ultrasparc -mvis -fcall-used-g6"
@@ -754,6 +773,7 @@ make %{?_smp_mflags} %{?make_output_sync} -C cc-base -k check || {
 # Exceptions:
 # None!
 make %{?_smp_mflags} %{?make_output_sync} -C cc-base check-abi
+make %{?_smp_mflags} %{?make_output_sync} -C cc-base test t=elf/check-localplt
 %endif

 %define rtldlib %{_lib}
--- a/posix-memalign-fragmentation.patch
+++ b/posix-memalign-fragmentation.patch
@@ -0,0 +1,554 @@
+From 542b1105852568c3ebc712225ae78b8c8ba31a78 Mon Sep 17 00:00:00 2001
+From: Florian Weimer <fweimer@redhat.com>
+Date: Fri, 11 Aug 2023 11:18:17 +0200
+Subject: [PATCH] malloc: Enable merging of remainders in memalign (bug 30723)
+
+Previously, calling _int_free from _int_memalign could put remainders
+into the tcache or into fastbins, where they are invisible to the
+low-level allocator.  This results in missed merge opportunities
+because once these freed chunks become available to the low-level
+allocator, further memalign allocations (even of the same size) are
+likely obstructing merges.
+
+Furthermore, during forwards merging in _int_memalign, do not
+completely give up when the remainder is too small to serve as a
+chunk on its own.  We can still give it back if it can be merged
+with the following unused chunk.  This makes it more likely that
+memalign calls in a loop achieve a compact memory layout,
+independently of initial heap layout.
+
+Drop some useless (unsigned long) casts along the way, and tweak
+the style to more closely match GNU on changed lines.
+
+Reviewed-by: DJ Delorie <dj@redhat.com>
+---
+ malloc/malloc.c | 197 +++++++++++++++++++++++++++++-------------------
+ 1 file changed, 121 insertions(+), 76 deletions(-)
+
+diff --git a/malloc/malloc.c b/malloc/malloc.c
+index e2f1a615a4..948f9759af 100644
+--- a/malloc/malloc.c
+++ b/malloc/malloc.c
+@@ -1086,6 +1086,11 @@ typedef struct malloc_chunk* mchunkptr;
+ 
+ static void*  _int_malloc(mstate, size_t);
+ static void     _int_free(mstate, mchunkptr, int);
+static void _int_free_merge_chunk (mstate, mchunkptr, INTERNAL_SIZE_T);
+static INTERNAL_SIZE_T _int_free_create_chunk (mstate,
+					       mchunkptr, INTERNAL_SIZE_T,
+					       mchunkptr, INTERNAL_SIZE_T);
+static void _int_free_maybe_consolidate (mstate, INTERNAL_SIZE_T);
+ static void*  _int_realloc(mstate, mchunkptr, INTERNAL_SIZE_T,
+ 			   INTERNAL_SIZE_T);
+ static void*  _int_memalign(mstate, size_t, size_t);
+@@ -4637,31 +4642,52 @@ _int_free (mstate av, mchunkptr p, int have_lock)
+     if (!have_lock)
+       __libc_lock_lock (av->mutex);
+ 
+-    nextchunk = chunk_at_offset(p, size);
+-
+-    /* Lightweight tests: check whether the block is already the
+-       top block.  */
+-    if (__glibc_unlikely (p == av->top))
+-      malloc_printerr ("double free or corruption (top)");
+-    /* Or whether the next chunk is beyond the boundaries of the arena.  */
+-    if (__builtin_expect (contiguous (av)
+-			  && (char *) nextchunk
+-			  >= ((char *) av->top + chunksize(av->top)), 0))
+-	malloc_printerr ("double free or corruption (out)");
+-    /* Or whether the block is actually not marked used.  */
+-    if (__glibc_unlikely (!prev_inuse(nextchunk)))
+-      malloc_printerr ("double free or corruption (!prev)");
+-
+-    nextsize = chunksize(nextchunk);
+-    if (__builtin_expect (chunksize_nomask (nextchunk) <= CHUNK_HDR_SZ, 0)
+-	|| __builtin_expect (nextsize >= av->system_mem, 0))
+-      malloc_printerr ("free(): invalid next size (normal)");
+    _int_free_merge_chunk (av, p, size);
+ 
+-    free_perturb (chunk2mem(p), size - CHUNK_HDR_SZ);
+    if (!have_lock)
+      __libc_lock_unlock (av->mutex);
+  }
+  /*
+    If the chunk was allocated via mmap, release via munmap().
+  */
+
+  else {
+    munmap_chunk (p);
+  }
+}
+
+/* Try to merge chunk P of SIZE bytes with its neighbors.  Put the
+   resulting chunk on the appropriate bin list.  P must not be on a
+   bin list yet, and it can be in use.  */
+static void
+_int_free_merge_chunk (mstate av, mchunkptr p, INTERNAL_SIZE_T size)
+{
+  mchunkptr nextchunk = chunk_at_offset(p, size);
+
+  /* Lightweight tests: check whether the block is already the
+     top block.  */
+  if (__glibc_unlikely (p == av->top))
+    malloc_printerr ("double free or corruption (top)");
+  /* Or whether the next chunk is beyond the boundaries of the arena.  */
+  if (__builtin_expect (contiguous (av)
+			&& (char *) nextchunk
+			>= ((char *) av->top + chunksize(av->top)), 0))
+    malloc_printerr ("double free or corruption (out)");
+  /* Or whether the block is actually not marked used.  */
+  if (__glibc_unlikely (!prev_inuse(nextchunk)))
+    malloc_printerr ("double free or corruption (!prev)");
+
+  INTERNAL_SIZE_T nextsize = chunksize(nextchunk);
+  if (__builtin_expect (chunksize_nomask (nextchunk) <= CHUNK_HDR_SZ, 0)
+      || __builtin_expect (nextsize >= av->system_mem, 0))
+    malloc_printerr ("free(): invalid next size (normal)");
+
+  free_perturb (chunk2mem(p), size - CHUNK_HDR_SZ);
+ 
+-    /* consolidate backward */
+-    if (!prev_inuse(p)) {
+-      prevsize = prev_size (p);
+  /* Consolidate backward.  */
+  if (!prev_inuse(p))
+    {
+      INTERNAL_SIZE_T prevsize = prev_size (p);
+       size += prevsize;
+       p = chunk_at_offset(p, -((long) prevsize));
+       if (__glibc_unlikely (chunksize(p) != prevsize))
+@@ -4669,9 +4695,25 @@ _int_free (mstate av, mchunkptr p, int have_lock)
+       unlink_chunk (av, p);
+     }
+ 
+-    if (nextchunk != av->top) {
+  /* Write the chunk header, maybe after merging with the following chunk.  */
+  size = _int_free_create_chunk (av, p, size, nextchunk, nextsize);
+  _int_free_maybe_consolidate (av, size);
+}
+
+/* Create a chunk at P of SIZE bytes, with SIZE potentially increased
+   to cover the immediately following chunk NEXTCHUNK of NEXTSIZE
+   bytes (if NEXTCHUNK is unused).  The chunk at P is not actually
+   read and does not have to be initialized.  After creation, it is
+   placed on the appropriate bin list.  The function returns the size
+   of the new chunk.  */
+static INTERNAL_SIZE_T
+_int_free_create_chunk (mstate av, mchunkptr p, INTERNAL_SIZE_T size,
+			mchunkptr nextchunk, INTERNAL_SIZE_T nextsize)
+{
+  if (nextchunk != av->top)
+    {
+       /* get and clear inuse bit */
+-      nextinuse = inuse_bit_at_offset(nextchunk, nextsize);
+      bool nextinuse = inuse_bit_at_offset (nextchunk, nextsize);
+ 
+       /* consolidate forward */
+       if (!nextinuse) {
+@@ -4686,8 +4728,8 @@ _int_free (mstate av, mchunkptr p, int have_lock)
+ 	been given one chance to be used in malloc.
+       */
+ 
+-      bck = unsorted_chunks(av);
+-      fwd = bck->fd;
+      mchunkptr bck = unsorted_chunks (av);
+      mchunkptr fwd = bck->fd;
+       if (__glibc_unlikely (fwd->bk != bck))
+ 	malloc_printerr ("free(): corrupted unsorted chunks");
+       p->fd = fwd;
+@@ -4706,61 +4748,52 @@ _int_free (mstate av, mchunkptr p, int have_lock)
+       check_free_chunk(av, p);
+     }
+ 
+-    /*
+-      If the chunk borders the current high end of memory,
+-      consolidate into top
+-    */
+-
+-    else {
+  else
+    {
+      /* If the chunk borders the current high end of memory,
+	 consolidate into top.  */
+       size += nextsize;
+       set_head(p, size | PREV_INUSE);
+       av->top = p;
+       check_chunk(av, p);
+     }
+ 
+-    /*
+-      If freeing a large space, consolidate possibly-surrounding
+-      chunks. Then, if the total unused topmost memory exceeds trim
+-      threshold, ask malloc_trim to reduce top.
+-
+-      Unless max_fast is 0, we don't know if there are fastbins
+-      bordering top, so we cannot tell for sure whether threshold
+-      has been reached unless fastbins are consolidated.  But we
+-      don't want to consolidate on each free.  As a compromise,
+-      consolidation is performed if FASTBIN_CONSOLIDATION_THRESHOLD
+-      is reached.
+-    */
+  return size;
+}
+ 
+-    if ((unsigned long)(size) >= FASTBIN_CONSOLIDATION_THRESHOLD) {
+/* If freeing a large space, consolidate possibly-surrounding
+   chunks.  Then, if the total unused topmost memory exceeds trim
+   threshold, ask malloc_trim to reduce top.  */
+static void
+_int_free_maybe_consolidate (mstate av, INTERNAL_SIZE_T size)
+{
+  /* Unless max_fast is 0, we don't know if there are fastbins
+     bordering top, so we cannot tell for sure whether threshold has
+     been reached unless fastbins are consolidated.  But we don't want
+     to consolidate on each free.  As a compromise, consolidation is
+     performed if FASTBIN_CONSOLIDATION_THRESHOLD is reached.  */
+  if (size >= FASTBIN_CONSOLIDATION_THRESHOLD)
+    {
+       if (atomic_load_relaxed (&av->have_fastchunks))
+ 	malloc_consolidate(av);
+ 
+-      if (av == &main_arena) {
+      if (av == &main_arena)
+	{
+ #ifndef MORECORE_CANNOT_TRIM
+-	if ((unsigned long)(chunksize(av->top)) >=
+-	    (unsigned long)(mp_.trim_threshold))
+-	  systrim(mp_.top_pad, av);
+	  if (chunksize (av->top) >= mp_.trim_threshold)
+	    systrim (mp_.top_pad, av);
+ #endif
+-      } else {
+-	/* Always try heap_trim(), even if the top chunk is not
+-	   large, because the corresponding heap might go away.  */
+-	heap_info *heap = heap_for_ptr(top(av));
+	}
+      else
+	{
+	  /* Always try heap_trim, even if the top chunk is not large,
+	     because the corresponding heap might go away.  */
+	  heap_info *heap = heap_for_ptr (top (av));
+ 
+-	assert(heap->ar_ptr == av);
+-	heap_trim(heap, mp_.top_pad);
+-      }
+	  assert (heap->ar_ptr == av);
+	  heap_trim (heap, mp_.top_pad);
+	}
+     }
+-
+-    if (!have_lock)
+-      __libc_lock_unlock (av->mutex);
+-  }
+-  /*
+-    If the chunk was allocated via mmap, release via munmap().
+-  */
+-
+-  else {
+-    munmap_chunk (p);
+-  }
+ }
+ 
+ /*
+@@ -5221,7 +5254,7 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
+                 (av != &main_arena ? NON_MAIN_ARENA : 0));
+       set_inuse_bit_at_offset (newp, newsize);
+       set_head_size (p, leadsize | (av != &main_arena ? NON_MAIN_ARENA : 0));
+-      _int_free (av, p, 1);
+      _int_free_merge_chunk (av, p, leadsize);
+       p = newp;
+ 
+       assert (newsize >= nb &&
+@@ -5232,15 +5265,27 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
+   if (!chunk_is_mmapped (p))
+     {
+       size = chunksize (p);
+-      if ((unsigned long) (size) > (unsigned long) (nb + MINSIZE))
+      mchunkptr nextchunk = chunk_at_offset(p, size);
+      INTERNAL_SIZE_T nextsize = chunksize(nextchunk);
+      if (size > nb)
+         {
+           remainder_size = size - nb;
+-          remainder = chunk_at_offset (p, nb);
+-          set_head (remainder, remainder_size | PREV_INUSE |
+-                    (av != &main_arena ? NON_MAIN_ARENA : 0));
+-          set_head_size (p, nb);
+-          _int_free (av, remainder, 1);
+-        }
+	  if (remainder_size >= MINSIZE
+	      || nextchunk == av->top
+	      || !inuse_bit_at_offset (nextchunk, nextsize))
+	    {
+	      /* We can only give back the tail if it is larger than
+		 MINSIZE, or if the following chunk is unused (top
+		 chunk or unused in-heap chunk).  Otherwise we would
+		 create a chunk that is smaller than MINSIZE.  */
+	      remainder = chunk_at_offset (p, nb);
+	      set_head_size (p, nb);
+	      remainder_size = _int_free_create_chunk (av, remainder,
+						       remainder_size,
+						       nextchunk, nextsize);
+	      _int_free_maybe_consolidate (av, remainder_size);
+	    }
+	}
+     }
+ 
+   check_inuse_chunk (av, p);
+-- 
+2.41.0
+
+From 0dc7fc1cf094406a138e4d1bcf9553e59edcf89d Mon Sep 17 00:00:00 2001
+From: Florian Weimer <fweimer@redhat.com>
+Date: Thu, 10 Aug 2023 19:36:56 +0200
+Subject: [PATCH] malloc: Remove bin scanning from memalign (bug 30723)
+
+On the test workload (mpv --cache=yes with VP9 video decoding), the
+bin scanning has a very poor success rate (less than 2%).  The tcache
+scanning has about 50% success rate, so keep that.
+
+Update comments in malloc/tst-memalign-2 to indicate the purpose
+of the tests.  Even with the scanning removed, the additional
+merging opportunities since commit 542b1105852568c3ebc712225ae78b
+("malloc: Enable merging of remainders in memalign (bug 30723)")
+are sufficient to pass the existing large bins test.
+
+Remove leftover variables from _int_free from refactoring in the
+same commit.
+
+Reviewed-by: DJ Delorie <dj@redhat.com>
+---
+ malloc/malloc.c         | 169 ++--------------------------------------
+ malloc/tst-memalign-2.c |   7 +-
+ 2 files changed, 10 insertions(+), 166 deletions(-)
+
+diff --git a/malloc/malloc.c b/malloc/malloc.c
+index 948f9759af..d0bbbf3710 100644
+--- a/malloc/malloc.c
+++ b/malloc/malloc.c
+@@ -4488,12 +4488,6 @@ _int_free (mstate av, mchunkptr p, int have_lock)
+ {
+   INTERNAL_SIZE_T size;        /* its size */
+   mfastbinptr *fb;             /* associated fastbin */
+-  mchunkptr nextchunk;         /* next contiguous chunk */
+-  INTERNAL_SIZE_T nextsize;    /* its size */
+-  int nextinuse;               /* true if nextchunk is used */
+-  INTERNAL_SIZE_T prevsize;    /* size of previous contiguous chunk */
+-  mchunkptr bck;               /* misc temp for linking */
+-  mchunkptr fwd;               /* misc temp for linking */
+ 
+   size = chunksize (p);
+ 
+@@ -5032,42 +5026,6 @@ _int_realloc (mstate av, mchunkptr oldp, INTERNAL_SIZE_T oldsize,
+    ------------------------------ memalign ------------------------------
+  */
+ 
+-/* Returns 0 if the chunk is not and does not contain the requested
+-   aligned sub-chunk, else returns the amount of "waste" from
+-   trimming.  NB is the *chunk* byte size, not the user byte
+-   size.  */
+-static size_t
+-chunk_ok_for_memalign (mchunkptr p, size_t alignment, size_t nb)
+-{
+-  void *m = chunk2mem (p);
+-  INTERNAL_SIZE_T size = chunksize (p);
+-  void *aligned_m = m;
+-
+-  if (__glibc_unlikely (misaligned_chunk (p)))
+-    malloc_printerr ("_int_memalign(): unaligned chunk detected");
+-
+-  aligned_m = PTR_ALIGN_UP (m, alignment);
+-
+-  INTERNAL_SIZE_T front_extra = (intptr_t) aligned_m - (intptr_t) m;
+-
+-  /* We can't trim off the front as it's too small.  */
+-  if (front_extra > 0 && front_extra < MINSIZE)
+-    return 0;
+-
+-  /* If it's a perfect fit, it's an exception to the return value rule
+-     (we would return zero waste, which looks like "not usable"), so
+-     handle it here by returning a small non-zero value instead.  */
+-  if (size == nb && front_extra == 0)
+-    return 1;
+-
+-  /* If the block we need fits in the chunk, calculate total waste.  */
+-  if (size > nb + front_extra)
+-    return size - nb;
+-
+-  /* Can't use this chunk.  */
+-  return 0;
+-}
+-
+ /* BYTES is user requested bytes, not requested chunksize bytes.  */
+ static void *
+ _int_memalign (mstate av, size_t alignment, size_t bytes)
+@@ -5082,7 +5040,6 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
+   mchunkptr remainder;            /* spare room at end to split off */
+   unsigned long remainder_size;   /* its size */
+   INTERNAL_SIZE_T size;
+-  mchunkptr victim;
+ 
+   nb = checked_request2size (bytes);
+   if (nb == 0)
+@@ -5101,129 +5058,13 @@ _int_memalign (mstate av, size_t alignment, size_t bytes)
+      we don't find anything in those bins, the common malloc code will
+      scan starting at 2x.  */
+ 
+-  /* This will be set if we found a candidate chunk.  */
+-  victim = NULL;
+-
+-  /* Fast bins are singly-linked, hard to remove a chunk from the middle
+-     and unlikely to meet our alignment requirements.  We have not done
+-     any experimentation with searching for aligned fastbins.  */
+-
+-  if (av != NULL)
+-    {
+-      int first_bin_index;
+-      int first_largebin_index;
+-      int last_bin_index;
+-
+-      if (in_smallbin_range (nb))
+-	first_bin_index = smallbin_index (nb);
+-      else
+-	first_bin_index = largebin_index (nb);
+-
+-      if (in_smallbin_range (nb * 2))
+-	last_bin_index = smallbin_index (nb * 2);
+-      else
+-	last_bin_index = largebin_index (nb * 2);
+-
+-      first_largebin_index = largebin_index (MIN_LARGE_SIZE);
+-
+-      int victim_index;                 /* its bin index */
+-
+-      for (victim_index = first_bin_index;
+-	   victim_index < last_bin_index;
+-	   victim_index ++)
+-	{
+-	  victim = NULL;
+-
+-	  if (victim_index < first_largebin_index)
+-	    {
+-	      /* Check small bins.  Small bin chunks are doubly-linked despite
+-		 being the same size.  */
+-
+-	      mchunkptr fwd;                    /* misc temp for linking */
+-	      mchunkptr bck;                    /* misc temp for linking */
+-
+-	      bck = bin_at (av, victim_index);
+-	      fwd = bck->fd;
+-	      while (fwd != bck)
+-		{
+-		  if (chunk_ok_for_memalign (fwd, alignment, nb) > 0)
+-		    {
+-		      victim = fwd;
+-
+-		      /* Unlink it */
+-		      victim->fd->bk = victim->bk;
+-		      victim->bk->fd = victim->fd;
+-		      break;
+-		    }
+-
+-		  fwd = fwd->fd;
+-		}
+-	    }
+-	  else
+-	    {
+-	      /* Check large bins.  */
+-	      mchunkptr fwd;                    /* misc temp for linking */
+-	      mchunkptr bck;                    /* misc temp for linking */
+-	      mchunkptr best = NULL;
+-	      size_t best_size = 0;
+-
+-	      bck = bin_at (av, victim_index);
+-	      fwd = bck->fd;
+  /* Call malloc with worst case padding to hit alignment. */
+  m = (char *) (_int_malloc (av, nb + alignment + MINSIZE));
+ 
+-	      while (fwd != bck)
+-		{
+-		  int extra;
+-
+-		  if (chunksize (fwd) < nb)
+-		    break;
+-		  extra = chunk_ok_for_memalign (fwd, alignment, nb);
+-		  if (extra > 0
+-		      && (extra <= best_size || best == NULL))
+-		    {
+-		      best = fwd;
+-		      best_size = extra;
+-		    }
+  if (m == 0)
+    return 0;           /* propagate failure */
+ 
+-		  fwd = fwd->fd;
+-		}
+-	      victim = best;
+-
+-	      if (victim != NULL)
+-		{
+-		  unlink_chunk (av, victim);
+-		  break;
+-		}
+-	    }
+-
+-	  if (victim != NULL)
+-	    break;
+-	}
+-    }
+-
+-  /* Strategy: find a spot within that chunk that meets the alignment
+-     request, and then possibly free the leading and trailing space.
+-     This strategy is incredibly costly and can lead to external
+-     fragmentation if header and footer chunks are unused.  */
+-
+-  if (victim != NULL)
+-    {
+-      p = victim;
+-      m = chunk2mem (p);
+-      set_inuse (p);
+-      if (av != &main_arena)
+-	set_non_main_arena (p);
+-    }
+-  else
+-    {
+-      /* Call malloc with worst case padding to hit alignment. */
+-
+-      m = (char *) (_int_malloc (av, nb + alignment + MINSIZE));
+-
+-      if (m == 0)
+-	return 0;           /* propagate failure */
+-
+-      p = mem2chunk (m);
+-    }
+  p = mem2chunk (m);
+ 
+   if ((((unsigned long) (m)) % alignment) != 0)   /* misaligned */
+     {
+diff --git a/malloc/tst-memalign-2.c b/malloc/tst-memalign-2.c
+index f229283dbf..ecd6fa249e 100644
+--- a/malloc/tst-memalign-2.c
+++ b/malloc/tst-memalign-2.c
+@@ -86,7 +86,8 @@ do_test (void)
+       TEST_VERIFY (tcache_allocs[i].ptr1 == tcache_allocs[i].ptr2);
+     }
+ 
+-  /* Test for non-head tcache hits.  */
+  /* Test for non-head tcache hits.  This exercises the memalign
+     scanning code to find matching allocations.  */
+   for (i = 0; i < array_length (ptr); ++ i)
+     {
+       if (i == 4)
+@@ -113,7 +114,9 @@ do_test (void)
+   free (p);
+   TEST_VERIFY (count > 0);
+ 
+-  /* Large bins test.  */
+  /* Large bins test.  This verifies that the over-allocated parts
+     that memalign releases for future allocations can be reused by
+     memalign itself at least in some cases.  */
+ 
+   for (i = 0; i < LN; ++ i)
+     {
+-- 
+2.41.0
+