diff --git a/glibc-testsuite.changes b/glibc-testsuite.changes index 286bb78..28e6771 100644 --- a/glibc-testsuite.changes +++ b/glibc-testsuite.changes @@ -1,3 +1,9 @@ +------------------------------------------------------------------- +Wed Apr 8 12:50:39 UTC 2015 - mgorman@suse.com + +- threaded-trim-threshold.patch: Fix regression in threaded application + malloc performance (bsc#915955, BZ #17195) + ------------------------------------------------------------------- Thu Apr 2 08:11:20 UTC 2015 - schwab@suse.de diff --git a/glibc-testsuite.spec b/glibc-testsuite.spec index b6ba37f..bd58940 100644 --- a/glibc-testsuite.spec +++ b/glibc-testsuite.spec @@ -243,6 +243,8 @@ Patch1003: pthread-mutexattr-gettype-kind.patch Patch1004: powerpc-software-sqrt.patch # PATCH-FIX-UPSTREAM Fix DTV race, assert, DTV_SURPLUS Static TLS limit, and nptl_db garbage (bsc#919678, BZ #17090, BZ #17620, BZ #17621, BZ #17628) Patch1005: static-tls-dtv-limit.patch +# PATCH-FIX-UPSTREAM Fix regression in threaded application malloc performance (bsc#915955, BZ #17195) +Patch1006: threaded-trim-threshold.patch ### # Patches awaiting upstream approval @@ -469,6 +471,7 @@ rm nscd/s-stamp %patch1003 -p1 %patch1004 -p1 %patch1005 -p1 +%patch1006 -p1 %patch2000 -p1 %patch2002 -p1 diff --git a/glibc-utils.changes b/glibc-utils.changes index 286bb78..28e6771 100644 --- a/glibc-utils.changes +++ b/glibc-utils.changes @@ -1,3 +1,9 @@ +------------------------------------------------------------------- +Wed Apr 8 12:50:39 UTC 2015 - mgorman@suse.com + +- threaded-trim-threshold.patch: Fix regression in threaded application + malloc performance (bsc#915955, BZ #17195) + ------------------------------------------------------------------- Thu Apr 2 08:11:20 UTC 2015 - schwab@suse.de diff --git a/glibc-utils.spec b/glibc-utils.spec index 4d14ed7..b5ae5d1 100644 --- a/glibc-utils.spec +++ b/glibc-utils.spec @@ -242,6 +242,8 @@ Patch1003: pthread-mutexattr-gettype-kind.patch Patch1004: 
powerpc-software-sqrt.patch # PATCH-FIX-UPSTREAM Fix DTV race, assert, DTV_SURPLUS Static TLS limit, and nptl_db garbage (bsc#919678, BZ #17090, BZ #17620, BZ #17621, BZ #17628) Patch1005: static-tls-dtv-limit.patch +# PATCH-FIX-UPSTREAM Fix regression in threaded application malloc performance (bsc#915955, BZ #17195) +Patch1006: threaded-trim-threshold.patch ### # Patches awaiting upstream approval @@ -469,6 +471,7 @@ rm nscd/s-stamp %patch1003 -p1 %patch1004 -p1 %patch1005 -p1 +%patch1006 -p1 %patch2000 -p1 %patch2002 -p1 diff --git a/glibc.changes b/glibc.changes index 286bb78..28e6771 100644 --- a/glibc.changes +++ b/glibc.changes @@ -1,3 +1,9 @@ +------------------------------------------------------------------- +Wed Apr 8 12:50:39 UTC 2015 - mgorman@suse.com + +- threaded-trim-threshold.patch: Fix regression in threaded application + malloc performance (bsc#915955, BZ #17195) + ------------------------------------------------------------------- Thu Apr 2 08:11:20 UTC 2015 - schwab@suse.de diff --git a/glibc.spec b/glibc.spec index f7d1dbf..e10f095 100644 --- a/glibc.spec +++ b/glibc.spec @@ -243,6 +243,8 @@ Patch1003: pthread-mutexattr-gettype-kind.patch Patch1004: powerpc-software-sqrt.patch # PATCH-FIX-UPSTREAM Fix DTV race, assert, DTV_SURPLUS Static TLS limit, and nptl_db garbage (bsc#919678, BZ #17090, BZ #17620, BZ #17621, BZ #17628) Patch1005: static-tls-dtv-limit.patch +# PATCH-FIX-UPSTREAM Fix regression in threaded application malloc performance (bsc#915955, BZ #17195) +Patch1006: threaded-trim-threshold.patch ### # Patches awaiting upstream approval @@ -469,6 +471,7 @@ rm nscd/s-stamp %patch1003 -p1 %patch1004 -p1 %patch1005 -p1 +%patch1006 -p1 %patch2000 -p1 %patch2002 -p1 diff --git a/threaded-trim-threshold.patch b/threaded-trim-threshold.patch new file mode 100644 index 0000000..4457442 --- /dev/null +++ b/threaded-trim-threshold.patch @@ -0,0 +1,224 @@ +From c26efef9798914e208329c0e8c3c73bb1135d9e3 Mon Sep 17 00:00:00 2001 +From: Mel Gorman 
+Date: Thu, 2 Apr 2015 12:14:14 +0530 +Subject: [PATCH] malloc: Consistently apply trim_threshold to all heaps [BZ + #17195] + +Trimming heaps is a balance between saving memory and the system overhead +required to update page tables and discard allocated pages. The malloc +option M_TRIM_THRESHOLD is a tunable that users are meant to use to decide +where this balance point is but it is only applied to the main arena. + +For scalability reasons, glibc malloc has per-thread heaps but these are +shrunk with madvise() if there is one page free at the top of the heap. +In some circumstances this can lead to high system overhead if a thread +has a control flow like + + while (data_to_process) { + buf = malloc(large_size); + do_stuff(); + free(buf); + } + +For a large size, the free() will call madvise (pagetable teardown, page +free and TLB flush) every time followed immediately by a malloc (fault, +kernel page alloc, zeroing and charge accounting). The kernel overhead +can dominate such a workload. + +This patch allows the user to tune when madvise gets called by applying +the trim threshold to the per-thread heaps and using similar logic to the +main arena when deciding whether to shrink. Alternatively if the dynamic +brk/mmap threshold gets adjusted then the new values will be obeyed by +the per-thread heaps. + +Bug 17195 was a test case motivated by a problem encountered in scientific +applications written in python that perform badly due to high page fault +overhead. The basic operation of such a program was posted by Julian Taylor +https://sourceware.org/ml/libc-alpha/2015-02/msg00373.html + +With this patch applied, the overhead is eliminated. All numbers in this +report are in seconds and were recorded by running Julian's program 30 +times.
+ +pyarray + glibc madvise + 2.21 v2 +System min 1.81 ( 0.00%) 0.00 (100.00%) +System mean 1.93 ( 0.00%) 0.02 ( 99.20%) +System stddev 0.06 ( 0.00%) 0.01 ( 88.99%) +System max 2.06 ( 0.00%) 0.03 ( 98.54%) +Elapsed min 3.26 ( 0.00%) 2.37 ( 27.30%) +Elapsed mean 3.39 ( 0.00%) 2.41 ( 28.84%) +Elapsed stddev 0.14 ( 0.00%) 0.02 ( 82.73%) +Elapsed max 4.05 ( 0.00%) 2.47 ( 39.01%) + + glibc madvise + 2.21 v2 +User 141.86 142.28 +System 57.94 0.60 +Elapsed 102.02 72.66 + +Note that almost a minute's worth of system time is eliminated and the +program completes 28% faster on average. + +To illustrate the problem without python this is a basic test-case for +the worst case scenario where every free is a madvise followed by an alloc + +/* gcc bench-free.c -lpthread -o bench-free */ +static int num = 1024; + +void __attribute__((noinline,noclone)) dostuff (void *p) +{ +} + +void *worker (void *data) +{ + int i; + + for (i = num; i--;) + { + void *m = malloc (48*4096); + dostuff (m); + free (m); + } + + return NULL; +} + +int main() +{ + int i; + pthread_t t; + void *ret; + if (pthread_create (&t, NULL, worker, NULL)) + exit (2); + if (pthread_join (t, &ret)) + exit (3); + return 0; +} + +Before the patch, this resulted in 1024 calls to madvise. With the patch applied, +madvise is called twice because the default trim threshold is high enough to avoid +this. + +This is a more complex case where there is a mix of frees. It's simply a different worker +function for the test case above + +void *worker (void *data) +{ + int i; + int j = 0; + void *free_index[num]; + + for (i = num; i--;) + { + void *m = malloc ((i % 58) *4096); + dostuff (m); + if (i % 2 == 0) { + free (m); + } else { + free_index[j++] = m; + } + } + for (; j >= 0; j--) + { + free(free_index[j]); + } + + return NULL; +} + +glibc 2.21 calls madvise 90305 times but with the patch applied, it's +called 13438.
Increasing the trim threshold will decrease the number of +times it's called with the option of eliminating the overhead. + +ebizzy is meant to generate a workload resembling common web application +server workloads. It is threaded with a large working set that at its core +has an allocation, do_stuff, free loop that also hits this case. The primary +metric of the benchmark is records processed per second. This is running on +my desktop which is a single socket machine with an I7-4770 and 8 cores. +Each thread count was run for 30 seconds. It was only run once as the +performance difference is so high that the variation is insignificant. + + glibc 2.21 patch +threads 1 10230 44114 +threads 2 19153 84925 +threads 4 34295 134569 +threads 8 51007 183387 + +Note that the saving happens to be a coincidence as the size allocated +by ebizzy was less than the default threshold. If a different number of +chunks were specified then it may also be necessary to tune the threshold +to compensate. + +This is roughly quadrupling the performance of this benchmark. The difference in +system CPU usage illustrates why. + +ebizzy running 1 thread with glibc 2.21 +10230 records/s 306904 +real 30.00 s +user 7.47 s +sys 22.49 s + +22.49 seconds was spent in the kernel for a workload running 30 seconds. With the +patch applied + +ebizzy running 1 thread with patch applied +44126 records/s 1323792 +real 30.00 s +user 29.97 s +sys 0.00 s + +system CPU usage was zero with the patch applied. strace shows that glibc +running this workload calls madvise approximately 9000 times a second. With +the patch applied madvise was called twice during the workload (or 0.06 +times per second). + +2015-02-10 Mel Gorman + + [BZ #17195] + * malloc/arena.c (heap_trim): Apply trim threshold to per-thread heaps + as well as the main arena. +--- + +--- a/malloc/malloc.c ++++ b/malloc/malloc.c +@@ -241,6 +241,8 @@ + /* For MIN, MAX, powerof2. */ + #include <sys/param.h> + ++/* For ALIGN_DOWN.
*/ ++#include <libc-internal.h> + + /* + Debugging: +--- a/malloc/arena.c ++++ b/malloc/arena.c +@@ -658,7 +658,7 @@ heap_trim (heap_info *heap, size_t pad) + unsigned long pagesz = GLRO (dl_pagesize); + mchunkptr top_chunk = top (ar_ptr), p, bck, fwd; + heap_info *prev_heap; +- long new_size, top_size, extra, prev_size, misalign; ++ long new_size, top_size, top_area, extra, prev_size, misalign; + + /* Can this heap go away completely? */ + while (top_chunk == chunk_at_offset (heap, sizeof (*heap))) +@@ -694,9 +694,16 @@ heap_trim (heap_info *heap, size_t pad) + set_head (top_chunk, new_size | PREV_INUSE); + /*check_chunk(ar_ptr, top_chunk);*/ + } ++ ++ /* Uses similar logic for per-thread arenas as the main arena with systrim ++ by preserving the top pad and at least a page. */ + top_size = chunksize (top_chunk); +- extra = (top_size - pad - MINSIZE - 1) & ~(pagesz - 1); +- if (extra < (long) pagesz) ++ top_area = top_size - MINSIZE - 1; ++ if (top_area <= pad) ++ return 0; ++ ++ extra = ALIGN_DOWN(top_area - pad, pagesz); ++ if ((unsigned long) extra < mp_.trim_threshold) + return 0; + + /* Try to shrink. */