diff --git a/baselibs.conf b/baselibs.conf index 42676f4..b89df19 100644 --- a/baselibs.conf +++ b/baselibs.conf @@ -14,6 +14,7 @@ glibc-locale glibc-devel requires "glibc- = %version" arch i586 block! + +^/usr/include/gnu/lib-names-.*\.h$ +^/usr/include/gnu/stubs-.*\.h$ glibc-devel-static arch i586 block! diff --git a/glibc-testsuite.changes b/glibc-testsuite.changes index 286bb78..efe1565 100644 --- a/glibc-testsuite.changes +++ b/glibc-testsuite.changes @@ -1,3 +1,18 @@ +------------------------------------------------------------------- +Tue Jun 9 08:16:46 UTC 2015 - schwab@suse.de + +- Add /usr/include/gnu/lib-names-.*.h to baselibs +- pthread-join-deadlock.patch: Don't require rtld lock to store static TLS + offset in the DTV (bsc#930015, BZ #18457) +- heap-top-corruption.patch: Do not corrupt the top of a threaded heap if + top chunk is MINSIZE (BZ #18502) + +------------------------------------------------------------------- +Wed Apr 8 12:50:39 UTC 2015 - mgorman@suse.com + +- threaded-trim-threshold.patch: Fix regression in threaded application + malloc performance (bsc#915955, BZ #17195) + ------------------------------------------------------------------- Thu Apr 2 08:11:20 UTC 2015 - schwab@suse.de diff --git a/glibc-testsuite.spec b/glibc-testsuite.spec index b6ba37f..8ac1b89 100644 --- a/glibc-testsuite.spec +++ b/glibc-testsuite.spec @@ -243,6 +243,14 @@ Patch1003: pthread-mutexattr-gettype-kind.patch Patch1004: powerpc-software-sqrt.patch # PATCH-FIX-UPSTREAM Fix DTV race, assert, DTV_SURPLUS Static TLS limit, and nptl_db garbage (bsc#919678, BZ #17090, BZ #17620, BZ #17621, BZ #17628) Patch1005: static-tls-dtv-limit.patch +# PATCH-FIX-UPSTREAM Fix regression in threaded application malloc performance (bsc#915955, BZ #17195) +Patch1006: threaded-trim-threshold.patch +# PATCH-FIX-UPSTREAM Simplify handling of nameserver configuration in resolver +Patch1007: resolv-nameserver-handling.patch +# PATCH-FIX-UPSTREAM Separate internal state between 
getXXent and getXXbyYY NSS calls (bsc#918187, BZ #18007) +Patch1008: nss-separate-state-getXXent.patch +# PATCH-FIX-UPSTREAM aarch64: Increase MINSIGSTKSZ and SIGSTKSZ (BZ #16850) +Patch1009: aarch64-sigstksz.patch ### # Patches awaiting upstream approval @@ -257,12 +265,10 @@ Patch2003: abort-no-flush.patch Patch2005: glibc-memset-nontemporal.diff # PATCH-FIX-UPSTREAM Avoid redundant shift character in iconv output at block boundary (BZ #17197) Patch2006: ibm93x-redundant-shift-si.patch -# PATCH-FIX-UPSTREAM Rewrite handling of nameserver configuration in resolver -Patch2007: resolv-nameserver-handling.patch -# PATCH-FIX-UPSTREAM Separate internal state between getXXent and getXXbyYY NSS calls (bsc#918187, BZ #18007) -Patch2008: nss-separate-state-getXXent.patch -# PATCH-FIX-UPSTREAM aarch64: Increase MINSIGSTKSZ and SIGSTKSZ (BZ #16850) -Patch2009: aarch64-sigstksz.patch +# PATCH-FIX-UPSTREAM Don't require rtld lock to store static TLS offset in the DTV (BZ #18457) +Patch2007: pthread-join-deadlock.patch +# PATCH-FIX-UPSTREAM malloc: Do not corrupt the top of a threaded heap if top chunk is MINSIZE (BZ #18502) +Patch2008: heap-top-corruption.patch # Non-glibc patches # PATCH-FIX-OPENSUSE Remove debianisms from manpages @@ -469,6 +475,10 @@ rm nscd/s-stamp %patch1003 -p1 %patch1004 -p1 %patch1005 -p1 +%patch1006 -p1 +%patch1007 -p1 +%patch1008 -p1 +%patch1009 -p1 %patch2000 -p1 %patch2002 -p1 @@ -477,7 +487,6 @@ rm nscd/s-stamp %patch2006 -p1 %patch2007 -p1 %patch2008 -p1 -%patch2009 -p1 %patch3000 @@ -917,8 +926,8 @@ touch %{buildroot}/run/nscd/{socket,nscd.pid} # Create ld.so.conf # cat > %{buildroot}/etc/ld.so.conf < %{buildroot}/etc/ld.so.conf < %{buildroot}/etc/ld.so.conf < +Subject: [PATCH] [v3] malloc: Do not corrupt the top of a threaded heap if + top chunk is MINSIZE [BZ #18502] +Date: Mon, 8 Jun 2015 13:36:13 +0100 + +mksquashfs was reported in openSUSE to be causing segmentation faults when +creating installation images. 
Testing showed that mksquashfs sometimes +failed and could be reproduced within 10 attempts. The core dump looked +like the heap top was corrupted and was pointing to an unmapped area. In +other cases, this has been due to an application corrupting glibc structures +but mksquashfs appears to be fine in this regard. + +The problem is that heap_trim is "growing" the top into unmapped space. +If the top chunk == MINSIZE then top_area is -1 and this check does not +behave as expected due to a signed/unsigned comparison. + + if (top_area <= pad) + return 0; + +The next calculation extra = ALIGN_DOWN(top_area - pad, pagesz) calculates +extra as a negative number which also is unnoticed due to a signed/unsigned +comparison. We then call shrink_heap(heap, negative_number) which crashes +later. This patch adds a simple check against MINSIZE to make sure extra +does not become negative. It adds a cast to hint to the reader that this +is a signed vs unsigned issue. + +Without the patch, mksquashfs fails within 10 attempts. With it applied, it +completed 1000 times without error. The standard test suite "make check" +showed no changes in the summary of test results. + +2015-06-08 Mel Gorman + + [BZ #18502] + * malloc/arena.c: Avoid corruption of the top of heaps for threads +--- + malloc/arena.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +Index: glibc-2.21/malloc/arena.c +=================================================================== +--- glibc-2.21.orig/malloc/arena.c ++++ glibc-2.21/malloc/arena.c +@@ -699,7 +699,7 @@ heap_trim (heap_info *heap, size_t pad) + by preserving the top pad and at least a page. 
*/ + top_size = chunksize (top_chunk); + top_area = top_size - MINSIZE - 1; +- if (top_area <= pad) ++ if (top_area < 0 || (size_t) top_area <= pad) + return 0; + + extra = ALIGN_DOWN(top_area - pad, pagesz); diff --git a/pthread-join-deadlock.patch b/pthread-join-deadlock.patch new file mode 100644 index 0000000..34c60d6 --- /dev/null +++ b/pthread-join-deadlock.patch @@ -0,0 +1,152 @@ + [PR dynamic-link/18457] + * elf/dl-tls.c (tls_get_addr_tail): Don't take the rtld lock + if we already have a final static TLS offset. + * nptl/tst-join7.c, nptl/tst-join7mod.c: New. + +Index: glibc-2.21/elf/dl-tls.c +=================================================================== +--- glibc-2.21.orig/elf/dl-tls.c ++++ glibc-2.21/elf/dl-tls.c +@@ -755,30 +755,44 @@ tls_get_addr_tail (GET_ADDR_ARGS, dtv_t + the_map = listp->slotinfo[idx].map; + } + +- /* Make sure that, if a dlopen running in parallel forces the +- variable into static storage, we'll wait until the address in the +- static TLS block is set up, and use that. If we're undecided +- yet, make sure we make the decision holding the lock as well. */ +- if (__glibc_unlikely (the_map->l_tls_offset +- != FORCED_DYNAMIC_TLS_OFFSET)) ++ /* If the TLS block for the map is already assigned to dynamic or to ++ static TLS, avoid the lock. Be careful to use the same value for ++ both tests; if we reloaded it, the second test might mistake ++ forced dynamic for an offset. Now, if the decision hasn't been ++ made, take the rtld lock, so that an ongoing dlopen gets a chance ++ to complete, and then retest; if the decision is still pending, ++ force the module to dynamic TLS. 
*/ ++ ptrdiff_t offset = atomic_load_relaxed (&the_map->l_tls_offset); ++ if (__glibc_unlikely (offset != FORCED_DYNAMIC_TLS_OFFSET)) + { ++ if (__glibc_unlikely (offset != NO_TLS_OFFSET)) ++ goto static_tls; + __rtld_lock_lock_recursive (GL(dl_load_lock)); +- if (__glibc_likely (the_map->l_tls_offset == NO_TLS_OFFSET)) ++ offset = the_map->l_tls_offset; ++ if (__glibc_likely (offset == NO_TLS_OFFSET)) + { + the_map->l_tls_offset = FORCED_DYNAMIC_TLS_OFFSET; + __rtld_lock_unlock_recursive (GL(dl_load_lock)); + } +- else if (__glibc_likely (the_map->l_tls_offset +- != FORCED_DYNAMIC_TLS_OFFSET)) ++ else if (__glibc_likely (offset != FORCED_DYNAMIC_TLS_OFFSET)) + { ++ /* The decision is made, and it is final. We use the value ++ we've already loaded, but we could even load the offset ++ after releasing the lock, since it won't change. Should ++ the module be released while another thread references ++ one of its TLS variables, that's undefined behavior. */ ++ __rtld_lock_unlock_recursive (GL(dl_load_lock)); ++ ++ static_tls: ++ ; ++ + #if TLS_TCB_AT_TP +- void *p = (char *) THREAD_SELF - the_map->l_tls_offset; ++ void *p = (char *) THREAD_SELF - offset; + #elif TLS_DTV_AT_TP +- void *p = (char *) THREAD_SELF + the_map->l_tls_offset + TLS_PRE_TCB_SIZE; ++ void *p = (char *) THREAD_SELF + offset + TLS_PRE_TCB_SIZE; + #else + # error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined" + #endif +- __rtld_lock_unlock_recursive (GL(dl_load_lock)); + + dtv[GET_ADDR_MODULE].pointer.is_static = true; + dtv[GET_ADDR_MODULE].pointer.val = p; +Index: glibc-2.21/nptl/Makefile +=================================================================== +--- glibc-2.21.orig/nptl/Makefile ++++ glibc-2.21/nptl/Makefile +@@ -234,7 +234,7 @@ tests = tst-typesizes \ + tst-basic7 \ + tst-kill1 tst-kill2 tst-kill3 tst-kill4 tst-kill5 tst-kill6 \ + tst-raise1 \ +- tst-join1 tst-join2 tst-join3 tst-join4 tst-join5 tst-join6 \ ++ tst-join1 tst-join2 tst-join3 tst-join4 tst-join5 tst-join6 
tst-join7 \ + tst-detach1 \ + tst-eintr1 tst-eintr2 tst-eintr3 tst-eintr4 tst-eintr5 \ + tst-tsd1 tst-tsd2 tst-tsd3 tst-tsd4 tst-tsd5 tst-tsd6 \ +@@ -312,7 +312,8 @@ endif + modules-names = tst-atfork2mod tst-tls3mod tst-tls4moda tst-tls4modb \ + tst-tls5mod tst-tls5moda tst-tls5modb tst-tls5modc \ + tst-tls5modd tst-tls5mode tst-tls5modf tst-stack4mod \ +- tst-_res1mod1 tst-_res1mod2 tst-execstack-mod tst-fini1mod ++ tst-_res1mod1 tst-_res1mod2 tst-execstack-mod tst-fini1mod \ ++ tst-join7mod + extra-test-objs += $(addsuffix .os,$(strip $(modules-names))) tst-cleanup4aux.o + test-extras += $(modules-names) tst-cleanup4aux + test-modules = $(addprefix $(objpfx),$(addsuffix .so,$(modules-names))) +@@ -517,6 +518,11 @@ $(objpfx)tst-tls6.out: tst-tls6.sh $(obj + $(evaluate-test) + endif + ++$(objpfx)tst-join7: $(libdl) $(shared-thread-library) ++$(objpfx)tst-join7.out: $(objpfx)tst-join7mod.so ++$(objpfx)tst-join7mod.so: $(shared-thread-library) ++LDFLAGS-tst-join7mod.so = -Wl,-soname,tst-join7mod.so ++ + $(objpfx)tst-dlsym1: $(libdl) $(shared-thread-library) + + $(objpfx)tst-fini1: $(shared-thread-library) $(objpfx)tst-fini1mod.so +Index: glibc-2.21/nptl/tst-join7.c +=================================================================== +--- /dev/null ++++ glibc-2.21/nptl/tst-join7.c +@@ -0,0 +1,12 @@ ++#include ++ ++int ++do_test (void) ++{ ++ void *f = dlopen ("tst-join7mod.so", RTLD_NOW | RTLD_GLOBAL); ++ if (f) dlclose (f); else return 1; ++ return 0; ++} ++ ++#define TEST_FUNCTION do_test () ++#include "../test-skeleton.c" +Index: glibc-2.21/nptl/tst-join7mod.c +=================================================================== +--- /dev/null ++++ glibc-2.21/nptl/tst-join7mod.c +@@ -0,0 +1,29 @@ ++#include ++#include ++ ++static pthread_t th; ++static int running = 1; ++ ++static void * ++test_run (void *p) ++{ ++ while (running) ++ fprintf (stderr, "XXX test_run\n"); ++ fprintf (stderr, "XXX test_run FINISHED\n"); ++ return NULL; ++} ++ ++static void 
__attribute__ ((constructor)) ++do_init (void) ++{ ++ pthread_create (&th, NULL, test_run, NULL); ++} ++ ++static void __attribute__ ((destructor)) ++do_end (void) ++{ ++ running = 0; ++ fprintf (stderr, "thread_join...\n"); ++ pthread_join (th, NULL); ++ fprintf (stderr, "thread_join DONE\n"); ++} diff --git a/threaded-trim-threshold.patch b/threaded-trim-threshold.patch new file mode 100644 index 0000000..4457442 --- /dev/null +++ b/threaded-trim-threshold.patch @@ -0,0 +1,224 @@ +From c26efef9798914e208329c0e8c3c73bb1135d9e3 Mon Sep 17 00:00:00 2001 +From: Mel Gorman +Date: Thu, 2 Apr 2015 12:14:14 +0530 +Subject: [PATCH] malloc: Consistently apply trim_threshold to all heaps [BZ + #17195] + +Trimming heaps is a balance between saving memory and the system overhead +required to update page tables and discard allocated pages. The malloc +option M_TRIM_THRESHOLD is a tunable that users are meant to use to decide +where this balance point is but it is only applied to the main arena. + +For scalability reasons, glibc malloc has per-thread heaps but these are +shrunk with madvise() if there is one page free at the top of the heap. +In some circumstances this can lead to high system overhead if a thread +has a control flow like + + while (data_to_process) { + buf = malloc(large_size); + do_stuff(); + free(buf); + } + +For a large size, the free() will call madvise (pagetable teardown, page +free and TLB flush) every time followed immediately by a malloc (fault, +kernel page alloc, zeroing and charge accounting). The kernel overhead +can dominate such a workload. + +This patch allows the user to tune when madvise gets called by applying +the trim threshold to the per-thread heaps and using similar logic to the +main arena when deciding whether to shrink. Alternatively if the dynamic +brk/mmap threshold gets adjusted then the new values will be obeyed by +the per-thread heaps. 
+ +Bug 17195 was a test case motivated by a problem encountered in scientific +applications written in python that perform badly due to high page fault +overhead. The basic operation of such a program was posted by Julian Taylor +https://sourceware.org/ml/libc-alpha/2015-02/msg00373.html + +With this patch applied, the overhead is eliminated. All numbers in this +report are in seconds and were recorded by running Julian's program 30 +times. + +pyarray + glibc madvise + 2.21 v2 +System min 1.81 ( 0.00%) 0.00 (100.00%) +System mean 1.93 ( 0.00%) 0.02 ( 99.20%) +System stddev 0.06 ( 0.00%) 0.01 ( 88.99%) +System max 2.06 ( 0.00%) 0.03 ( 98.54%) +Elapsed min 3.26 ( 0.00%) 2.37 ( 27.30%) +Elapsed mean 3.39 ( 0.00%) 2.41 ( 28.84%) +Elapsed stddev 0.14 ( 0.00%) 0.02 ( 82.73%) +Elapsed max 4.05 ( 0.00%) 2.47 ( 39.01%) + + glibc madvise + 2.21 v2 +User 141.86 142.28 +System 57.94 0.60 +Elapsed 102.02 72.66 + +Note that almost a minute's worth of system time is eliminated and the +program completes 28% faster on average. + +To illustrate the problem without python this is a basic test-case for +the worst case scenario where every free is a madvise followed by an alloc + +/* gcc bench-free.c -lpthread -o bench-free */ +static int num = 1024; + +void __attribute__((noinline,noclone)) dostuff (void *p) +{ +} + +void *worker (void *data) +{ + int i; + + for (i = num; i--;) + { + void *m = malloc (48*4096); + dostuff (m); + free (m); + } + + return NULL; +} + +int main() +{ + int i; + pthread_t t; + void *ret; + if (pthread_create (&t, NULL, worker, NULL)) + exit (2); + if (pthread_join (t, &ret)) + exit (3); + return 0; +} + +Before the patch, this resulted in 1024 calls to madvise. With the patch applied, +madvise is called twice because the default trim threshold is high enough to avoid +this. + +This is a more complex case where there is a mix of frees. 
It's simply a different worker +function for the test case above + +void *worker (void *data) +{ + int i; + int j = 0; + void *free_index[num]; + + for (i = num; i--;) + { + void *m = malloc ((i % 58) *4096); + dostuff (m); + if (i % 2 == 0) { + free (m); + } else { + free_index[j++] = m; + } + } + for (; j >= 0; j--) + { + free(free_index[j]); + } + + return NULL; +} + +glibc 2.21 calls malloc 90305 times but with the patch applied, it's +called 13438. Increasing the trim threshold will decrease the number of +times it's called with the option of eliminating the overhead. + +ebizzy is meant to generate a workload resembling common web application +server workloads. It is threaded with a large working set that at its core +has an allocation, do_stuff, free loop that also hits this case. The primary +metric of the benchmark is records processed per second. This is running on +my desktop which is a single socket machine with an I7-4770 and 8 cores. +Each thread count was run for 30 seconds. It was only run once as the +performance difference is so high that the variation is insignificant. + + glibc 2.21 patch +threads 1 10230 44114 +threads 2 19153 84925 +threads 4 34295 134569 +threads 8 51007 183387 + +Note that the saving happens to be a coincidence as the size allocated +by ebizzy was less than the default threshold. If a different number of +chunks were specified then it may also be necessary to tune the threshold +to compensate. + +This is roughly quadrupling the performance of this benchmark. The difference in +system CPU usage illustrates why. + +ebizzy running 1 thread with glibc 2.21 +10230 records/s 306904 +real 30.00 s +user 7.47 s +sys 22.49 s + +22.49 seconds was spent in the kernel for a workload running 30 seconds. With the +patch applied + +ebizzy running 1 thread with patch applied +44126 records/s 1323792 +real 30.00 s +user 29.97 s +sys 0.00 s + +system CPU usage was zero with the patch applied. 
strace shows that glibc +running this workload calls madvise approximately 9000 times a second. With +the patch applied madvise was called twice during the workload (or 0.06 +times per second). + +2015-02-10 Mel Gorman + + [BZ #17195] + * malloc/arena.c (free): Apply trim threshold to per-thread heaps + as well as the main arena. +--- + +--- a/malloc/malloc.c ++++ b/malloc/malloc.c +@@ -241,6 +241,8 @@ + /* For MIN, MAX, powerof2. */ + #include + ++/* For ALIGN_DOWN. */ ++#include + + /* + Debugging: +--- a/malloc/arena.c ++++ b/malloc/arena.c +@@ -658,7 +658,7 @@ heap_trim (heap_info *heap, size_t pad) + unsigned long pagesz = GLRO (dl_pagesize); + mchunkptr top_chunk = top (ar_ptr), p, bck, fwd; + heap_info *prev_heap; +- long new_size, top_size, extra, prev_size, misalign; ++ long new_size, top_size, top_area, extra, prev_size, misalign; + + /* Can this heap go away completely? */ + while (top_chunk == chunk_at_offset (heap, sizeof (*heap))) +@@ -694,9 +694,16 @@ heap_trim (heap_info *heap, size_t pad) + set_head (top_chunk, new_size | PREV_INUSE); + /*check_chunk(ar_ptr, top_chunk);*/ + } ++ ++ /* Uses similar logic for per-thread arenas as the main arena with systrim ++ by preserving the top pad and at least a page. */ + top_size = chunksize (top_chunk); +- extra = (top_size - pad - MINSIZE - 1) & ~(pagesz - 1); +- if (extra < (long) pagesz) ++ top_area = top_size - MINSIZE - 1; ++ if (top_area <= pad) ++ return 0; ++ ++ extra = ALIGN_DOWN(top_area - pad, pagesz); ++ if ((unsigned long) extra < mp_.trim_threshold) + return 0; + + /* Try to shrink. */