forked from pool/makedumpfile
Fix a data race in multi-threading mode (--num-threads=N) #3
117
makedumpfile-fix-multi-threading-data-race.patch
Normal file
117
makedumpfile-fix-multi-threading-data-race.patch
Normal file
@@ -0,0 +1,117 @@
|
|||||||
|
From: Tao Liu <ltao@redhat.com>
|
||||||
|
Date: Wed, 25 Jun 2025 14:23:44 +1200
|
||||||
|
Subject: Fix a data race in multi-threading mode (--num-threads=N)
|
||||||
|
References: bsc#1245569
|
||||||
|
Git-commit: 65bf4c9ef0fd0cbf2fb99b60e15b00d984b391b8
|
||||||
|
Upstream: merged
|
||||||
|
|
||||||
|
A vmcore corrupt issue has been noticed in powerpc arch [1]. It can be
|
||||||
|
reproduced with upstream makedumpfile.
|
||||||
|
|
||||||
|
When analyzing the corrupt vmcore using crash, the following error
|
||||||
|
message will output:
|
||||||
|
|
||||||
|
crash: compressed kdump: uncompress failed: 0
|
||||||
|
crash: read error: kernel virtual address: c0001e2d2fe48000 type:
|
||||||
|
"hardirq thread_union"
|
||||||
|
crash: cannot read hardirq_ctx[930] at c0001e2d2fe48000
|
||||||
|
crash: compressed kdump: uncompress failed: 0
|
||||||
|
|
||||||
|
If the vmcore is generated without num-threads option, then no such
|
||||||
|
errors are noticed.
|
||||||
|
|
||||||
|
With --num-threads=N enabled, there will be N sub-threads created. All
|
||||||
|
sub-threads are producers which responsible for mm page processing, e.g.
|
||||||
|
compression. The main thread is the consumer which responsible for
|
||||||
|
writing the compressed data into file. page_flag_buf->ready is used to
|
||||||
|
sync main and sub-threads. When a sub-thread finishes page processing,
|
||||||
|
it will set ready flag to be FLAG_READY. In the meantime, main thread
|
||||||
|
looply check all threads of the ready flags, and break the loop when
|
||||||
|
find FLAG_READY.
|
||||||
|
|
||||||
|
page_flag_buf->ready is read/write by main/sub-threads simultaneously,
|
||||||
|
but it is unprotected and unsafe. I have tested both mutex and atomic_rw
|
||||||
|
can fix this issue. This patch takes atomic_rw for its simplicity.
|
||||||
|
|
||||||
|
[1]: https://github.com/makedumpfile/makedumpfile/issues/15
|
||||||
|
|
||||||
|
Resolves: https://github.com/makedumpfile/makedumpfile/issues/15
|
||||||
|
Tested-by: Sourabh Jain <sourabhjain@linux.ibm.com>
|
||||||
|
Signed-off-by: Tao Liu <ltao@redhat.com>
|
||||||
|
Acked-by: Petr Tesarik <ptesarik@suse.com>
|
||||||
|
---
|
||||||
|
makedumpfile.c | 21 ++++++++++++++-------
|
||||||
|
1 file changed, 14 insertions(+), 7 deletions(-)
|
||||||
|
|
||||||
|
--- a/makedumpfile.c
|
||||||
|
+++ b/makedumpfile.c
|
||||||
|
@@ -8623,7 +8623,8 @@ kdump_thread_function_cyclic(void *arg)
|
||||||
|
|
||||||
|
while (buf_ready == FALSE) {
|
||||||
|
pthread_testcancel();
|
||||||
|
- if (page_flag_buf->ready == FLAG_READY)
|
||||||
|
+ if (__atomic_load_n(&page_flag_buf->ready,
|
||||||
|
+ __ATOMIC_SEQ_CST) == FLAG_READY)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
/* get next dumpable pfn */
|
||||||
|
@@ -8639,7 +8640,8 @@ kdump_thread_function_cyclic(void *arg)
|
||||||
|
info->current_pfn = pfn + 1;
|
||||||
|
|
||||||
|
page_flag_buf->pfn = pfn;
|
||||||
|
- page_flag_buf->ready = FLAG_FILLING;
|
||||||
|
+ __atomic_store_n(&page_flag_buf->ready, FLAG_FILLING,
|
||||||
|
+ __ATOMIC_SEQ_CST);
|
||||||
|
pthread_mutex_unlock(&info->current_pfn_mutex);
|
||||||
|
sem_post(&info->page_flag_buf_sem);
|
||||||
|
|
||||||
|
@@ -8728,7 +8730,8 @@ kdump_thread_function_cyclic(void *arg)
|
||||||
|
page_flag_buf->index = index;
|
||||||
|
buf_ready = TRUE;
|
||||||
|
next:
|
||||||
|
- page_flag_buf->ready = FLAG_READY;
|
||||||
|
+ __atomic_store_n(&page_flag_buf->ready, FLAG_READY,
|
||||||
|
+ __ATOMIC_SEQ_CST);
|
||||||
|
page_flag_buf = page_flag_buf->next;
|
||||||
|
|
||||||
|
}
|
||||||
|
@@ -8857,7 +8860,8 @@ write_kdump_pages_parallel_cyclic(struct
|
||||||
|
* current_pfn is used for recording the value of pfn when checking the pfn.
|
||||||
|
*/
|
||||||
|
for (i = 0; i < info->num_threads; i++) {
|
||||||
|
- if (info->page_flag_buf[i]->ready == FLAG_UNUSED)
|
||||||
|
+ if (__atomic_load_n(&info->page_flag_buf[i]->ready,
|
||||||
|
+ __ATOMIC_SEQ_CST) == FLAG_UNUSED)
|
||||||
|
continue;
|
||||||
|
temp_pfn = info->page_flag_buf[i]->pfn;
|
||||||
|
|
||||||
|
@@ -8865,7 +8869,8 @@ write_kdump_pages_parallel_cyclic(struct
|
||||||
|
* count how many threads have reached the end.
|
||||||
|
*/
|
||||||
|
if (temp_pfn >= end_pfn) {
|
||||||
|
- info->page_flag_buf[i]->ready = FLAG_UNUSED;
|
||||||
|
+ __atomic_store_n(&info->page_flag_buf[i]->ready,
|
||||||
|
+ FLAG_UNUSED, __ATOMIC_SEQ_CST);
|
||||||
|
end_count++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
@@ -8887,7 +8892,8 @@ write_kdump_pages_parallel_cyclic(struct
|
||||||
|
* If the page_flag_buf is not ready, the pfn recorded may be changed.
|
||||||
|
* So we should recheck.
|
||||||
|
*/
|
||||||
|
- if (info->page_flag_buf[consuming]->ready != FLAG_READY) {
|
||||||
|
+ if (__atomic_load_n(&info->page_flag_buf[consuming]->ready,
|
||||||
|
+ __ATOMIC_SEQ_CST) != FLAG_READY) {
|
||||||
|
clock_gettime(CLOCK_MONOTONIC, &new);
|
||||||
|
if (new.tv_sec - last.tv_sec > WAIT_TIME) {
|
||||||
|
ERRMSG("Can't get data of pfn.\n");
|
||||||
|
@@ -8929,7 +8935,8 @@ write_kdump_pages_parallel_cyclic(struct
|
||||||
|
goto out;
|
||||||
|
page_data_buf[index].used = FALSE;
|
||||||
|
}
|
||||||
|
- info->page_flag_buf[consuming]->ready = FLAG_UNUSED;
|
||||||
|
+ __atomic_store_n(&info->page_flag_buf[consuming]->ready,
|
||||||
|
+ FLAG_UNUSED, __ATOMIC_SEQ_CST);
|
||||||
|
info->page_flag_buf[consuming] = info->page_flag_buf[consuming]->next;
|
||||||
|
}
|
||||||
|
finish:
|
@@ -1,3 +1,9 @@
|
|||||||
|
-------------------------------------------------------------------
|
||||||
|
Fri Jul 18 12:40:21 UTC 2025 - Petr Tesařík <ptesarik@suse.com>
|
||||||
|
|
||||||
|
- makedumpfile-fix-multi-threading-data-race.patch: Fix a data race
|
||||||
|
in multi-threading mode (--num-threads=N) (bsc#1245569).
|
||||||
|
|
||||||
-------------------------------------------------------------------
|
-------------------------------------------------------------------
|
||||||
Thu May 29 11:10:50 UTC 2025 - Petr Tesařík <ptesarik@suse.com>
|
Thu May 29 11:10:50 UTC 2025 - Petr Tesařík <ptesarik@suse.com>
|
||||||
|
|
||||||
|
@@ -40,6 +40,7 @@ Source99: %{name}-rpmlintrc
|
|||||||
Patch0: %{name}-override-libtinfo.patch
|
Patch0: %{name}-override-libtinfo.patch
|
||||||
Patch1: %{name}-ppc64-VA-range-SUSE.patch
|
Patch1: %{name}-ppc64-VA-range-SUSE.patch
|
||||||
Patch2: %{name}-PN_XNUM.patch
|
Patch2: %{name}-PN_XNUM.patch
|
||||||
|
Patch3: %{name}-fix-multi-threading-data-race.patch
|
||||||
BuildRequires: libbz2-devel
|
BuildRequires: libbz2-devel
|
||||||
BuildRequires: libdw-devel
|
BuildRequires: libdw-devel
|
||||||
BuildRequires: libelf-devel
|
BuildRequires: libelf-devel
|
||||||
|
Reference in New Issue
Block a user