SHA256
1
0
forked from pool/glibc

Accepting request 311392 from Base:System

- Add /usr/include/gnu/lib-names-.*.h to baselibs
- pthread-join-deadlock.patch: Don't require rtld lock to store static TLS
  offset in the DTV (bsc#930015, BZ #18457)
- heap-top-corruption.patch: Do not corrupt the top of a threaded heap if
  top chunk is MINSIZE (BZ #18502) (forwarded request 311391 from Andreas_Schwab)

OBS-URL: https://build.opensuse.org/request/show/311392
OBS-URL: https://build.opensuse.org/package/show/openSUSE:Factory/glibc?expand=0&rev=191
This commit is contained in:
Stephan Kulow 2015-06-16 12:04:08 +00:00 committed by Git OBS Bridge
parent 17572084d9
commit ab29d5500a
10 changed files with 527 additions and 27 deletions

View File

@ -14,6 +14,7 @@ glibc-locale
glibc-devel
requires "glibc-<targettype> = %version"
arch i586 block!
+^/usr/include/gnu/lib-names-.*\.h$
+^/usr/include/gnu/stubs-.*\.h$
glibc-devel-static
arch i586 block!

View File

@ -1,3 +1,18 @@
-------------------------------------------------------------------
Tue Jun 9 08:16:46 UTC 2015 - schwab@suse.de
- Add /usr/include/gnu/lib-names-.*.h to baselibs
- pthread-join-deadlock.patch: Don't require rtld lock to store static TLS
offset in the DTV (bsc#930015, BZ #18457)
- heap-top-corruption.patch: Do not corrupt the top of a threaded heap if
top chunk is MINSIZE (BZ #18502)
-------------------------------------------------------------------
Wed Apr 8 12:50:39 UTC 2015 - mgorman@suse.com
- threaded-trim-threshold.patch: Fix regression in threaded application
malloc performance (bsc#915955, BZ #17195)
-------------------------------------------------------------------
Thu Apr 2 08:11:20 UTC 2015 - schwab@suse.de

View File

@ -243,6 +243,14 @@ Patch1003: pthread-mutexattr-gettype-kind.patch
Patch1004: powerpc-software-sqrt.patch
# PATCH-FIX-UPSTREAM Fix DTV race, assert, DTV_SURPLUS Static TLS limit, and nptl_db garbage (bsc#919678, BZ #17090, BZ #17620, BZ #17621, BZ #17628)
Patch1005: static-tls-dtv-limit.patch
# PATCH-FIX-UPSTREAM Fix regression in threaded application malloc performance (bsc#915955, BZ #17195)
Patch1006: threaded-trim-threshold.patch
# PATCH-FIX-UPSTREAM Simplify handling of nameserver configuration in resolver
Patch1007: resolv-nameserver-handling.patch
# PATCH-FIX-UPSTREAM Separate internal state between getXXent and getXXbyYY NSS calls (bsc#918187, BZ #18007)
Patch1008: nss-separate-state-getXXent.patch
# PATCH-FIX-UPSTREAM aarch64: Increase MINSIGSTKSZ and SIGSTKSZ (BZ #16850)
Patch1009: aarch64-sigstksz.patch
###
# Patches awaiting upstream approval
@ -257,12 +265,10 @@ Patch2003: abort-no-flush.patch
Patch2005: glibc-memset-nontemporal.diff
# PATCH-FIX-UPSTREAM Avoid redundant shift character in iconv output at block boundary (BZ #17197)
Patch2006: ibm93x-redundant-shift-si.patch
# PATCH-FIX-UPSTREAM Rewrite handling of nameserver configuration in resolver
Patch2007: resolv-nameserver-handling.patch
# PATCH-FIX-UPSTREAM Separate internal state between getXXent and getXXbyYY NSS calls (bsc#918187, BZ #18007)
Patch2008: nss-separate-state-getXXent.patch
# PATCH-FIX-UPSTREAM aarch64: Increase MINSIGSTKSZ and SIGSTKSZ (BZ #16850)
Patch2009: aarch64-sigstksz.patch
# PATCH-FIX-UPSTREAM Don't require rtld lock to store static TLS offset in the DTV (BZ #18457)
Patch2007: pthread-join-deadlock.patch
# PATCH-FIX-UPSTREAM malloc: Do not corrupt the top of a threaded heap if top chunk is MINSIZE (BZ #18502)
Patch2008: heap-top-corruption.patch
# Non-glibc patches
# PATCH-FIX-OPENSUSE Remove debianisms from manpages
@ -469,6 +475,10 @@ rm nscd/s-stamp
%patch1003 -p1
%patch1004 -p1
%patch1005 -p1
%patch1006 -p1
%patch1007 -p1
%patch1008 -p1
%patch1009 -p1
%patch2000 -p1
%patch2002 -p1
@ -477,7 +487,6 @@ rm nscd/s-stamp
%patch2006 -p1
%patch2007 -p1
%patch2008 -p1
%patch2009 -p1
%patch3000
@ -917,8 +926,8 @@ touch %{buildroot}/run/nscd/{socket,nscd.pid}
# Create ld.so.conf
#
cat > %{buildroot}/etc/ld.so.conf <<EOF
%if "%{_lib}" == "lib64"
/usr/local/lib64
%if "%{_lib}" != "lib"
/usr/local/%{_lib}
%endif
%ifarch ppc
/usr/local/lib64

View File

@ -1,3 +1,18 @@
-------------------------------------------------------------------
Tue Jun 9 08:16:46 UTC 2015 - schwab@suse.de
- Add /usr/include/gnu/lib-names-.*.h to baselibs
- pthread-join-deadlock.patch: Don't require rtld lock to store static TLS
offset in the DTV (bsc#930015, BZ #18457)
- heap-top-corruption.patch: Do not corrupt the top of a threaded heap if
top chunk is MINSIZE (BZ #18502)
-------------------------------------------------------------------
Wed Apr 8 12:50:39 UTC 2015 - mgorman@suse.com
- threaded-trim-threshold.patch: Fix regression in threaded application
malloc performance (bsc#915955, BZ #17195)
-------------------------------------------------------------------
Thu Apr 2 08:11:20 UTC 2015 - schwab@suse.de

View File

@ -242,6 +242,14 @@ Patch1003: pthread-mutexattr-gettype-kind.patch
Patch1004: powerpc-software-sqrt.patch
# PATCH-FIX-UPSTREAM Fix DTV race, assert, DTV_SURPLUS Static TLS limit, and nptl_db garbage (bsc#919678, BZ #17090, BZ #17620, BZ #17621, BZ #17628)
Patch1005: static-tls-dtv-limit.patch
# PATCH-FIX-UPSTREAM Fix regression in threaded application malloc performance (bsc#915955, BZ #17195)
Patch1006: threaded-trim-threshold.patch
# PATCH-FIX-UPSTREAM Simplify handling of nameserver configuration in resolver
Patch1007: resolv-nameserver-handling.patch
# PATCH-FIX-UPSTREAM Separate internal state between getXXent and getXXbyYY NSS calls (bsc#918187, BZ #18007)
Patch1008: nss-separate-state-getXXent.patch
# PATCH-FIX-UPSTREAM aarch64: Increase MINSIGSTKSZ and SIGSTKSZ (BZ #16850)
Patch1009: aarch64-sigstksz.patch
###
# Patches awaiting upstream approval
@ -256,12 +264,10 @@ Patch2003: abort-no-flush.patch
Patch2005: glibc-memset-nontemporal.diff
# PATCH-FIX-UPSTREAM Avoid redundant shift character in iconv output at block boundary (BZ #17197)
Patch2006: ibm93x-redundant-shift-si.patch
# PATCH-FIX-UPSTREAM Rewrite handling of nameserver configuration in resolver
Patch2007: resolv-nameserver-handling.patch
# PATCH-FIX-UPSTREAM Separate internal state between getXXent and getXXbyYY NSS calls (bsc#918187, BZ #18007)
Patch2008: nss-separate-state-getXXent.patch
# PATCH-FIX-UPSTREAM aarch64: Increase MINSIGSTKSZ and SIGSTKSZ (BZ #16850)
Patch2009: aarch64-sigstksz.patch
# PATCH-FIX-UPSTREAM Don't require rtld lock to store static TLS offset in the DTV (BZ #18457)
Patch2007: pthread-join-deadlock.patch
# PATCH-FIX-UPSTREAM malloc: Do not corrupt the top of a threaded heap if top chunk is MINSIZE (BZ #18502)
Patch2008: heap-top-corruption.patch
# Non-glibc patches
# PATCH-FIX-OPENSUSE Remove debianisms from manpages
@ -469,6 +475,10 @@ rm nscd/s-stamp
%patch1003 -p1
%patch1004 -p1
%patch1005 -p1
%patch1006 -p1
%patch1007 -p1
%patch1008 -p1
%patch1009 -p1
%patch2000 -p1
%patch2002 -p1
@ -477,7 +487,6 @@ rm nscd/s-stamp
%patch2006 -p1
%patch2007 -p1
%patch2008 -p1
%patch2009 -p1
%patch3000
@ -917,8 +926,8 @@ touch %{buildroot}/run/nscd/{socket,nscd.pid}
# Create ld.so.conf
#
cat > %{buildroot}/etc/ld.so.conf <<EOF
%if "%{_lib}" == "lib64"
/usr/local/lib64
%if "%{_lib}" != "lib"
/usr/local/%{_lib}
%endif
%ifarch ppc
/usr/local/lib64

View File

@ -1,3 +1,18 @@
-------------------------------------------------------------------
Tue Jun 9 08:16:46 UTC 2015 - schwab@suse.de
- Add /usr/include/gnu/lib-names-.*.h to baselibs
- pthread-join-deadlock.patch: Don't require rtld lock to store static TLS
offset in the DTV (bsc#930015, BZ #18457)
- heap-top-corruption.patch: Do not corrupt the top of a threaded heap if
top chunk is MINSIZE (BZ #18502)
-------------------------------------------------------------------
Wed Apr 8 12:50:39 UTC 2015 - mgorman@suse.com
- threaded-trim-threshold.patch: Fix regression in threaded application
malloc performance (bsc#915955, BZ #17195)
-------------------------------------------------------------------
Thu Apr 2 08:11:20 UTC 2015 - schwab@suse.de

View File

@ -243,6 +243,14 @@ Patch1003: pthread-mutexattr-gettype-kind.patch
Patch1004: powerpc-software-sqrt.patch
# PATCH-FIX-UPSTREAM Fix DTV race, assert, DTV_SURPLUS Static TLS limit, and nptl_db garbage (bsc#919678, BZ #17090, BZ #17620, BZ #17621, BZ #17628)
Patch1005: static-tls-dtv-limit.patch
# PATCH-FIX-UPSTREAM Fix regression in threaded application malloc performance (bsc#915955, BZ #17195)
Patch1006: threaded-trim-threshold.patch
# PATCH-FIX-UPSTREAM Simplify handling of nameserver configuration in resolver
Patch1007: resolv-nameserver-handling.patch
# PATCH-FIX-UPSTREAM Separate internal state between getXXent and getXXbyYY NSS calls (bsc#918187, BZ #18007)
Patch1008: nss-separate-state-getXXent.patch
# PATCH-FIX-UPSTREAM aarch64: Increase MINSIGSTKSZ and SIGSTKSZ (BZ #16850)
Patch1009: aarch64-sigstksz.patch
###
# Patches awaiting upstream approval
@ -257,12 +265,10 @@ Patch2003: abort-no-flush.patch
Patch2005: glibc-memset-nontemporal.diff
# PATCH-FIX-UPSTREAM Avoid redundant shift character in iconv output at block boundary (BZ #17197)
Patch2006: ibm93x-redundant-shift-si.patch
# PATCH-FIX-UPSTREAM Rewrite handling of nameserver configuration in resolver
Patch2007: resolv-nameserver-handling.patch
# PATCH-FIX-UPSTREAM Separate internal state between getXXent and getXXbyYY NSS calls (bsc#918187, BZ #18007)
Patch2008: nss-separate-state-getXXent.patch
# PATCH-FIX-UPSTREAM aarch64: Increase MINSIGSTKSZ and SIGSTKSZ (BZ #16850)
Patch2009: aarch64-sigstksz.patch
# PATCH-FIX-UPSTREAM Don't require rtld lock to store static TLS offset in the DTV (BZ #18457)
Patch2007: pthread-join-deadlock.patch
# PATCH-FIX-UPSTREAM malloc: Do not corrupt the top of a threaded heap if top chunk is MINSIZE (BZ #18502)
Patch2008: heap-top-corruption.patch
# Non-glibc patches
# PATCH-FIX-OPENSUSE Remove debianisms from manpages
@ -469,6 +475,10 @@ rm nscd/s-stamp
%patch1003 -p1
%patch1004 -p1
%patch1005 -p1
%patch1006 -p1
%patch1007 -p1
%patch1008 -p1
%patch1009 -p1
%patch2000 -p1
%patch2002 -p1
@ -477,7 +487,6 @@ rm nscd/s-stamp
%patch2006 -p1
%patch2007 -p1
%patch2008 -p1
%patch2009 -p1
%patch3000
@ -917,8 +926,8 @@ touch %{buildroot}/run/nscd/{socket,nscd.pid}
# Create ld.so.conf
#
cat > %{buildroot}/etc/ld.so.conf <<EOF
%if "%{_lib}" == "lib64"
/usr/local/lib64
%if "%{_lib}" != "lib"
/usr/local/%{_lib}
%endif
%ifarch ppc
/usr/local/lib64

51
heap-top-corruption.patch Normal file
View File

@ -0,0 +1,51 @@
From: Mel Gorman <mgorman@suse.de>
Subject: [PATCH] [v3] malloc: Do not corrupt the top of a threaded heap if
top chunk is MINSIZE [BZ #18502]
Date: Mon, 8 Jun 2015 13:36:13 +0100
mksquashfs was reported in openSUSE to be causing segmentation faults when
creating installation images. Testing showed that mksquashfs sometimes
failed and could be reproduced within 10 attempts. The core dump looked
like the heap top was corrupted and was pointing to an unmapped area. In
other cases, this has been due to an application corrupting glibc structures
but mksquashfs appears to be fine in this regard.
The problem is that heap_trim is "growing" the top into unmapped space.
If the top chunk == MINSIZE then top_area is -1 and this check does not
behave as expected due to a signed/unsigned comparison
if (top_area <= pad)
return 0;
The next calculation extra = ALIGN_DOWN(top_area - pad, pagesz) calculates
extra as a negative number which also is unnoticed due to a signed/unsigned
comparison. We then call shrink_heap(heap, negative_number) which crashes
later. This patch adds a simple check against MINSIZE to make sure extra
does not become negative. It adds a cast to hint to the reader that this
is a signed vs unsigned issue.
Without the patch, mksquashfs fails within 10 attempts. With it applied, it
completed 1000 times without error. The standard test suite "make check"
showed no changes in the summary of test results.
2015-06-08 Mel Gorman <mgorman@suse.de>
[BZ #18502]
* malloc/arena.c: Avoid corruption of the top of heaps for threads
---
malloc/arena.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
Index: glibc-2.21/malloc/arena.c
===================================================================
--- glibc-2.21.orig/malloc/arena.c
+++ glibc-2.21/malloc/arena.c
@@ -699,7 +699,7 @@ heap_trim (heap_info *heap, size_t pad)
by preserving the top pad and at least a page. */
top_size = chunksize (top_chunk);
top_area = top_size - MINSIZE - 1;
- if (top_area <= pad)
+ if (top_area < 0 || (size_t) top_area <= pad)
return 0;
extra = ALIGN_DOWN(top_area - pad, pagesz);

152
pthread-join-deadlock.patch Normal file
View File

@ -0,0 +1,152 @@
[PR dynamic-link/18457]
* elf/dl-tls.c (tls_get_addr_tail): Don't take the rtld lock
if we already have a final static TLS offset.
* nptl/tst-join7.c, nptl/tst-join7mod.c: New.
Index: glibc-2.21/elf/dl-tls.c
===================================================================
--- glibc-2.21.orig/elf/dl-tls.c
+++ glibc-2.21/elf/dl-tls.c
@@ -755,30 +755,44 @@ tls_get_addr_tail (GET_ADDR_ARGS, dtv_t
the_map = listp->slotinfo[idx].map;
}
- /* Make sure that, if a dlopen running in parallel forces the
- variable into static storage, we'll wait until the address in the
- static TLS block is set up, and use that. If we're undecided
- yet, make sure we make the decision holding the lock as well. */
- if (__glibc_unlikely (the_map->l_tls_offset
- != FORCED_DYNAMIC_TLS_OFFSET))
+ /* If the TLS block for the map is already assigned to dynamic or to
+ static TLS, avoid the lock. Be careful to use the same value for
+ both tests; if we reloaded it, the second test might mistake
+ forced dynamic for an offset. Now, if the decision hasn't been
+ made, take the rtld lock, so that an ongoing dlopen gets a chance
+ to complete, and then retest; if the decision is still pending,
+ force the module to dynamic TLS. */
+ ptrdiff_t offset = atomic_load_relaxed (&the_map->l_tls_offset);
+ if (__glibc_unlikely (offset != FORCED_DYNAMIC_TLS_OFFSET))
{
+ if (__glibc_unlikely (offset != NO_TLS_OFFSET))
+ goto static_tls;
__rtld_lock_lock_recursive (GL(dl_load_lock));
- if (__glibc_likely (the_map->l_tls_offset == NO_TLS_OFFSET))
+ offset = the_map->l_tls_offset;
+ if (__glibc_likely (offset == NO_TLS_OFFSET))
{
the_map->l_tls_offset = FORCED_DYNAMIC_TLS_OFFSET;
__rtld_lock_unlock_recursive (GL(dl_load_lock));
}
- else if (__glibc_likely (the_map->l_tls_offset
- != FORCED_DYNAMIC_TLS_OFFSET))
+ else if (__glibc_likely (offset != FORCED_DYNAMIC_TLS_OFFSET))
{
+ /* The decision is made, and it is final. We use the value
+ we've already loaded, but we could even load the offset
+ after releasing the lock, since it won't change. Should
+ the module be released while another thread references
+ one of its TLS variables, that's undefined behavior. */
+ __rtld_lock_unlock_recursive (GL(dl_load_lock));
+
+ static_tls:
+ ;
+
#if TLS_TCB_AT_TP
- void *p = (char *) THREAD_SELF - the_map->l_tls_offset;
+ void *p = (char *) THREAD_SELF - offset;
#elif TLS_DTV_AT_TP
- void *p = (char *) THREAD_SELF + the_map->l_tls_offset + TLS_PRE_TCB_SIZE;
+ void *p = (char *) THREAD_SELF + offset + TLS_PRE_TCB_SIZE;
#else
# error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined"
#endif
- __rtld_lock_unlock_recursive (GL(dl_load_lock));
dtv[GET_ADDR_MODULE].pointer.is_static = true;
dtv[GET_ADDR_MODULE].pointer.val = p;
Index: glibc-2.21/nptl/Makefile
===================================================================
--- glibc-2.21.orig/nptl/Makefile
+++ glibc-2.21/nptl/Makefile
@@ -234,7 +234,7 @@ tests = tst-typesizes \
tst-basic7 \
tst-kill1 tst-kill2 tst-kill3 tst-kill4 tst-kill5 tst-kill6 \
tst-raise1 \
- tst-join1 tst-join2 tst-join3 tst-join4 tst-join5 tst-join6 \
+ tst-join1 tst-join2 tst-join3 tst-join4 tst-join5 tst-join6 tst-join7 \
tst-detach1 \
tst-eintr1 tst-eintr2 tst-eintr3 tst-eintr4 tst-eintr5 \
tst-tsd1 tst-tsd2 tst-tsd3 tst-tsd4 tst-tsd5 tst-tsd6 \
@@ -312,7 +312,8 @@ endif
modules-names = tst-atfork2mod tst-tls3mod tst-tls4moda tst-tls4modb \
tst-tls5mod tst-tls5moda tst-tls5modb tst-tls5modc \
tst-tls5modd tst-tls5mode tst-tls5modf tst-stack4mod \
- tst-_res1mod1 tst-_res1mod2 tst-execstack-mod tst-fini1mod
+ tst-_res1mod1 tst-_res1mod2 tst-execstack-mod tst-fini1mod \
+ tst-join7mod
extra-test-objs += $(addsuffix .os,$(strip $(modules-names))) tst-cleanup4aux.o
test-extras += $(modules-names) tst-cleanup4aux
test-modules = $(addprefix $(objpfx),$(addsuffix .so,$(modules-names)))
@@ -517,6 +518,11 @@ $(objpfx)tst-tls6.out: tst-tls6.sh $(obj
$(evaluate-test)
endif
+$(objpfx)tst-join7: $(libdl) $(shared-thread-library)
+$(objpfx)tst-join7.out: $(objpfx)tst-join7mod.so
+$(objpfx)tst-join7mod.so: $(shared-thread-library)
+LDFLAGS-tst-join7mod.so = -Wl,-soname,tst-join7mod.so
+
$(objpfx)tst-dlsym1: $(libdl) $(shared-thread-library)
$(objpfx)tst-fini1: $(shared-thread-library) $(objpfx)tst-fini1mod.so
Index: glibc-2.21/nptl/tst-join7.c
===================================================================
--- /dev/null
+++ glibc-2.21/nptl/tst-join7.c
@@ -0,0 +1,12 @@
+#include <dlfcn.h>
+
+int
+do_test (void)
+{
+ void *f = dlopen ("tst-join7mod.so", RTLD_NOW | RTLD_GLOBAL);
+ if (f) dlclose (f); else return 1;
+ return 0;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
Index: glibc-2.21/nptl/tst-join7mod.c
===================================================================
--- /dev/null
+++ glibc-2.21/nptl/tst-join7mod.c
@@ -0,0 +1,29 @@
+#include <stdio.h>
+#include <pthread.h>
+
+static pthread_t th;
+static int running = 1;
+
+static void *
+test_run (void *p)
+{
+ while (running)
+ fprintf (stderr, "XXX test_run\n");
+ fprintf (stderr, "XXX test_run FINISHED\n");
+ return NULL;
+}
+
+static void __attribute__ ((constructor))
+do_init (void)
+{
+ pthread_create (&th, NULL, test_run, NULL);
+}
+
+static void __attribute__ ((destructor))
+do_end (void)
+{
+ running = 0;
+ fprintf (stderr, "thread_join...\n");
+ pthread_join (th, NULL);
+ fprintf (stderr, "thread_join DONE\n");
+}

View File

@ -0,0 +1,224 @@
From c26efef9798914e208329c0e8c3c73bb1135d9e3 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Thu, 2 Apr 2015 12:14:14 +0530
Subject: [PATCH] malloc: Consistently apply trim_threshold to all heaps [BZ
#17195]
Trimming heaps is a balance between saving memory and the system overhead
required to update page tables and discard allocated pages. The malloc
option M_TRIM_THRESHOLD is a tunable that users are meant to use to decide
where this balance point is but it is only applied to the main arena.
For scalability reasons, glibc malloc has per-thread heaps but these are
shrunk with madvise() if there is one page free at the top of the heap.
In some circumstances this can lead to high system overhead if a thread
has a control flow like
while (data_to_process) {
buf = malloc(large_size);
do_stuff();
free(buf);
}
For a large size, the free() will call madvise (pagetable teardown, page
free and TLB flush) every time followed immediately by a malloc (fault,
kernel page alloc, zeroing and charge accounting). The kernel overhead
can dominate such a workload.
This patch allows the user to tune when madvise gets called by applying
the trim threshold to the per-thread heaps and using similar logic to the
main arena when deciding whether to shrink. Alternatively if the dynamic
brk/mmap threshold gets adjusted then the new values will be obeyed by
the per-thread heaps.
Bug 17195 was a test case motivated by a problem encountered in scientific
applications written in python that perform badly due to high page fault
overhead. The basic operation of such a program was posted by Julian Taylor
https://sourceware.org/ml/libc-alpha/2015-02/msg00373.html
With this patch applied, the overhead is eliminated. All numbers in this
report are in seconds and were recorded by running Julian's program 30
times.
pyarray
glibc madvise
2.21 v2
System min 1.81 ( 0.00%) 0.00 (100.00%)
System mean 1.93 ( 0.00%) 0.02 ( 99.20%)
System stddev 0.06 ( 0.00%) 0.01 ( 88.99%)
System max 2.06 ( 0.00%) 0.03 ( 98.54%)
Elapsed min 3.26 ( 0.00%) 2.37 ( 27.30%)
Elapsed mean 3.39 ( 0.00%) 2.41 ( 28.84%)
Elapsed stddev 0.14 ( 0.00%) 0.02 ( 82.73%)
Elapsed max 4.05 ( 0.00%) 2.47 ( 39.01%)
glibc madvise
2.21 v2
User 141.86 142.28
System 57.94 0.60
Elapsed 102.02 72.66
Note that almost a minute's worth of system time is eliminated and the
program completes 28% faster on average.
To illustrate the problem without python this is a basic test-case for
the worst case scenario where every free is a madvise followed by an alloc
/* gcc bench-free.c -lpthread -o bench-free */
static int num = 1024;
void __attribute__((noinline,noclone)) dostuff (void *p)
{
}
void *worker (void *data)
{
int i;
for (i = num; i--;)
{
void *m = malloc (48*4096);
dostuff (m);
free (m);
}
return NULL;
}
int main()
{
int i;
pthread_t t;
void *ret;
if (pthread_create (&t, NULL, worker, NULL))
exit (2);
if (pthread_join (t, &ret))
exit (3);
return 0;
}
Before the patch, this resulted in 1024 calls to madvise. With the patch applied,
madvise is called twice because the default trim threshold is high enough to avoid
this.
This is a more complex case where there is a mix of frees. It's simply a different worker
function for the test case above
void *worker (void *data)
{
int i;
int j = 0;
void *free_index[num];
for (i = num; i--;)
{
void *m = malloc ((i % 58) *4096);
dostuff (m);
if (i % 2 == 0) {
free (m);
} else {
free_index[j++] = m;
}
}
for (; j >= 0; j--)
{
free(free_index[j]);
}
return NULL;
}
glibc 2.21 calls madvise 90305 times but with the patch applied, it's
called 13438. Increasing the trim threshold will decrease the number of
times it's called with the option of eliminating the overhead.
ebizzy is meant to generate a workload resembling common web application
server workloads. It is threaded with a large working set that at its core
has an allocation, do_stuff, free loop that also hits this case. The primary
metric of the benchmark is records processed per second. This is running on
my desktop which is a single socket machine with an I7-4770 and 8 cores.
Each thread count was run for 30 seconds. It was only run once as the
performance difference is so high that the variation is insignificant.
glibc 2.21 patch
threads 1 10230 44114
threads 2 19153 84925
threads 4 34295 134569
threads 8 51007 183387
Note that the saving happens to be a coincidence as the size allocated
by ebizzy was less than the default threshold. If a different number of
chunks were specified then it may also be necessary to tune the threshold
to compensate
This is roughly quadrupling the performance of this benchmark. The difference in
system CPU usage illustrates why.
ebizzy running 1 thread with glibc 2.21
10230 records/s 306904
real 30.00 s
user 7.47 s
sys 22.49 s
22.49 seconds was spent in the kernel for a workload running 30 seconds. With the
patch applied
ebizzy running 1 thread with patch applied
44126 records/s 1323792
real 30.00 s
user 29.97 s
sys 0.00 s
system CPU usage was zero with the patch applied. strace shows that glibc
running this workload calls madvise approximately 9000 times a second. With
the patch applied madvise was called twice during the workload (or 0.06
times per second).
2015-02-10 Mel Gorman <mgorman@suse.de>
[BZ #17195]
* malloc/arena.c (free): Apply trim threshold to per-thread heaps
as well as the main arena.
---
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -241,6 +241,8 @@
/* For MIN, MAX, powerof2. */
#include <sys/param.h>
+/* For ALIGN_DOWN. */
+#include <libc-internal.h>
/*
Debugging:
--- a/malloc/arena.c
+++ b/malloc/arena.c
@@ -658,7 +658,7 @@ heap_trim (heap_info *heap, size_t pad)
unsigned long pagesz = GLRO (dl_pagesize);
mchunkptr top_chunk = top (ar_ptr), p, bck, fwd;
heap_info *prev_heap;
- long new_size, top_size, extra, prev_size, misalign;
+ long new_size, top_size, top_area, extra, prev_size, misalign;
/* Can this heap go away completely? */
while (top_chunk == chunk_at_offset (heap, sizeof (*heap)))
@@ -694,9 +694,16 @@ heap_trim (heap_info *heap, size_t pad)
set_head (top_chunk, new_size | PREV_INUSE);
/*check_chunk(ar_ptr, top_chunk);*/
}
+
+ /* Uses similar logic for per-thread arenas as the main arena with systrim
+ by preserving the top pad and at least a page. */
top_size = chunksize (top_chunk);
- extra = (top_size - pad - MINSIZE - 1) & ~(pagesz - 1);
- if (extra < (long) pagesz)
+ top_area = top_size - MINSIZE - 1;
+ if (top_area <= pad)
+ return 0;
+
+ extra = ALIGN_DOWN(top_area - pad, pagesz);
+ if ((unsigned long) extra < mp_.trim_threshold)
return 0;
/* Try to shrink. */