From caf2d5c87687919f9e3cc8093d98aff54d21f6e19e7cece9b108be03ef3c8f3b Mon Sep 17 00:00:00 2001 From: Nicolas Morey Date: Mon, 3 Nov 2025 18:46:42 +0100 Subject: [PATCH 1/8] Use external hwloc Signed-off-by: Nicolas Morey --- mpich.changes | 5 +++++ mpich.spec | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/mpich.changes b/mpich.changes index 2aed6ed..5fff4d9 100644 --- a/mpich.changes +++ b/mpich.changes @@ -1,3 +1,8 @@ +------------------------------------------------------------------- +Tue Nov 4 17:54:18 UTC 2025 - Nicolas Morey + +- Use external hwloc + ------------------------------------------------------------------- Tue Oct 28 17:54:04 UTC 2025 - Nicolas Morey diff --git a/mpich.spec b/mpich.spec index 9513b1a..77b39ef 100644 --- a/mpich.spec +++ b/mpich.spec @@ -101,6 +101,7 @@ BuildRequires: libtool BuildRequires: libtool BuildRequires: mpi-selector BuildRequires: python3-devel +BuildRequires: hwloc-devel >= 2.0 %if "%{build_flavor}" == "ofi" BuildRequires: libfabric-devel @@ -114,7 +115,6 @@ BuildRequires: libuct-devel >= 1.7.0 # UCX is only available for 64b archs ExcludeArch: %ix86 %arm %endif - Provides: mpi BuildRequires: Modules BuildRequires: gcc-c++ @@ -205,6 +205,7 @@ export FCFLAGS="-fallow-argument-mismatch $FCFLAGS" --sysconfdir=%{p_sysconfdir} \ --disable-rpath \ --disable-wrapper-rpath \ + --with-hwloc \ %if "%{build_flavor}" == "ofi" --with-ofi \ --with-device=ch4:ofi \ -- 2.51.1 From 323ae9025795a69d158324f38319527c627ecfb300e066501fb5aa239d6c2298 Mon Sep 17 00:00:00 2001 From: Nicolas Morey Date: Tue, 4 Nov 2025 19:03:23 +0100 Subject: [PATCH 2/8] Rename standard multibuild flavor to ucx for more clarity Signed-off-by: Nicolas Morey --- _multibuild | 4 ++-- mpich.changes | 1 + mpich.spec | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/_multibuild b/_multibuild index 606536d..a8e6772 100644 --- a/_multibuild +++ b/_multibuild @@ -1,6 +1,6 @@ - standard - testsuite + ucx + ucx-testsuite ofi 
ofi-testsuite diff --git a/mpich.changes b/mpich.changes index 5fff4d9..b9ba771 100644 --- a/mpich.changes +++ b/mpich.changes @@ -2,6 +2,7 @@ Tue Nov 4 17:54:18 UTC 2025 - Nicolas Morey - Use external hwloc +- Rename standard multibuild flavor to ucx for more clarity ------------------------------------------------------------------- Tue Oct 28 17:54:04 UTC 2025 - Nicolas Morey diff --git a/mpich.spec b/mpich.spec index 77b39ef..7211aef 100644 --- a/mpich.spec +++ b/mpich.spec @@ -29,10 +29,10 @@ ExclusiveArch: do_not_build %endif -%if "%{flavor}" == "standard" +%if "%{flavor}" == "ucx" %define build_flavor ucx %endif -%if "%{flavor}" == "testsuite" +%if "%{flavor}" == "ucx-testsuite" %define build_flavor ucx %define testsuite 1 %endif -- 2.51.1 From 2bb6e745cc864f75fd2fa52587807bd1a811beed21ea9ed1b0fe4770e53a33ed Mon Sep 17 00:00:00 2001 From: Nicolas Morey Date: Tue, 4 Nov 2025 21:22:41 +0100 Subject: [PATCH 3/8] use autoreconf This is much much faster than rerunning fully autogen.sh Signed-off-by: Nicolas Morey --- mpich.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mpich.spec b/mpich.spec index 7211aef..72683a4 100644 --- a/mpich.spec +++ b/mpich.spec @@ -191,7 +191,7 @@ export FFLAGS="-fallow-argument-mismatch $FFLAGS" export FCFLAGS="-fallow-argument-mismatch $FCFLAGS" %endif -./autogen.sh --without-ucx --without-ofi --without-json +autoreconf -fi %configure \ --prefix=%{p_prefix} \ --exec-prefix=%{p_prefix} \ -- 2.51.1 From 15f2f50e9f9720170f32adf369f91487ec80cf08551df2cf696b0df8f2e6c86b Mon Sep 17 00:00:00 2001 From: Nicolas Morey Date: Sat, 8 Nov 2025 20:07:38 +0100 Subject: [PATCH 4/8] disable CMA in make check CMA does not work in OBS with recent kernels as it requires special user capabilities Signed-off-by: Nicolas Morey --- mpich.spec | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mpich.spec b/mpich.spec index 72683a4..cb7832b 100644 --- a/mpich.spec +++ b/mpich.spec @@ -258,6 +258,9 @@ find %{buildroot} -name "*.a" 
-delete rm -rf %{buildroot}/* %check +# Disable CMA. Modern kernels require specific ptrace capabilities +# that are not available in OBS +export MPIR_CVAR_CH4_CMA_ENABLE=0 make check %else -- 2.51.1 From 43f9f2060707a699eb8d6b2c8490c255548d238f1005e177f5b302ac683b5f53 Mon Sep 17 00:00:00 2001 From: Nicolas Morey Date: Sat, 8 Nov 2025 23:38:38 +0100 Subject: [PATCH 5/8] Fix a datatype issue on s390x Signed-off-by: Nicolas Morey --- ...IDI_POSIX_mpi_release_gather_release.patch | 47 +++++++++++++++++++ mpich.changes | 2 + mpich.spec | 1 + 3 files changed, 50 insertions(+) create mode 100644 ch4-shm-fix-data-type-for-recv_bytes-in-MPIDI_POSIX_mpi_release_gather_release.patch diff --git a/ch4-shm-fix-data-type-for-recv_bytes-in-MPIDI_POSIX_mpi_release_gather_release.patch b/ch4-shm-fix-data-type-for-recv_bytes-in-MPIDI_POSIX_mpi_release_gather_release.patch new file mode 100644 index 0000000..282f867 --- /dev/null +++ b/ch4-shm-fix-data-type-for-recv_bytes-in-MPIDI_POSIX_mpi_release_gather_release.patch @@ -0,0 +1,47 @@ +commit 564c0affae6a79b3c99fce5717c402182d74daa1 +Author: Nicolas Morey +Date: Sat Nov 8 23:34:58 2025 +0100 + + ch4: shm: fix data type for recv_bytes in MPIDI_POSIX_mpi_release_gather_release + + The number of received bytes in release_gather_release is badly cast between + int and MPI_Aint. On most architectures this is not an issue, but on big-endian 64-bit architectures (s390x) + it ends up losing the actual value as we only copy the 4 most significant bytes. + Fix the issue by writing the whole MPI_Aint in the shm_buf instead of just an int. 
+ + Signed-off-by: Nicolas Morey + +diff --git src/mpid/ch4/shm/posix/release_gather/release_gather.h src/mpid/ch4/shm/posix/release_gather/release_gather.h +index ac966cb9772e..ff1308830d00 100644 +--- src/mpid/ch4/shm/posix/release_gather/release_gather.h ++++ src/mpid/ch4/shm/posix/release_gather/release_gather.h +@@ -121,7 +121,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_mpi_release_gather_release(void *local_ + datatype, root, MPIR_BCAST_TAG, comm_ptr, &status); + MPIR_ERR_CHECK(mpi_errno); + MPIR_Get_count_impl(&status, MPI_BYTE, &recv_bytes); +- MPIR_Typerep_copy(bcast_data_addr, &recv_bytes, sizeof(int), ++ MPIR_Typerep_copy(bcast_data_addr, &recv_bytes, sizeof(MPI_Aint), + MPIR_TYPEREP_FLAG_NONE); + /* It is necessary to copy the errflag as well to handle the case when non-root + * becomes temporary root as part of compositions (or smp aware colls). These temp +@@ -146,7 +146,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_mpi_release_gather_release(void *local_ + /* When error checking is enabled, place the datasize in shm_buf first, followed by the + * errflag, followed by the actual data with an offset of (2*cacheline_size) bytes from + * the starting address */ +- MPIR_Typerep_copy(bcast_data_addr, &count, sizeof(int), MPIR_TYPEREP_FLAG_NONE); ++ MPIR_Typerep_copy(bcast_data_addr, &count, sizeof(MPI_Aint), MPIR_TYPEREP_FLAG_NONE); + /* It is necessary to copy the errflag as well to handle the case when non-root + * becomes root as part of compositions (or smp aware colls). These roots might + * expect same data as other ranks but different from the actual root. So only +@@ -218,8 +218,9 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_mpi_release_gather_release(void *local_ + * datasize is copied out from shm_buffer and compared against the count a rank was + * expecting. Also, the errflag is copied out. In case of mismatch mpi_errno is set. 
+ * Actual data starts after (2*cacheline_size) bytes */ +- int recv_bytes, recv_errflag; +- MPIR_Typerep_copy(&recv_bytes, bcast_data_addr, sizeof(int), MPIR_TYPEREP_FLAG_NONE); ++ MPI_Aint recv_bytes; ++ int recv_errflag; ++ MPIR_Typerep_copy(&recv_bytes, bcast_data_addr, sizeof(MPI_Aint), MPIR_TYPEREP_FLAG_NONE); + MPIR_Typerep_copy(&recv_errflag, (char *) bcast_data_addr + MPIDU_SHM_CACHE_LINE_LEN, + sizeof(int), MPIR_TYPEREP_FLAG_NONE); + MPIR_ERR_CHKANDJUMP2(recv_bytes != count, mpi_errno, MPI_ERR_OTHER, diff --git a/mpich.changes b/mpich.changes index b9ba771..e6c3100 100644 --- a/mpich.changes +++ b/mpich.changes @@ -3,6 +3,8 @@ Tue Nov 4 17:54:18 UTC 2025 - Nicolas Morey - Use external hwloc - Rename standard multibuild flavor to ucx for more clarity +- Add ch4-shm-fix-data-type-for-recv_bytes-in-MPIDI_POSIX_mpi_release_gather_release.patch + to fix a datatype issue on s390x ------------------------------------------------------------------- Tue Oct 28 17:54:04 UTC 2025 - Nicolas Morey diff --git a/mpich.spec b/mpich.spec index cb7832b..64d41f1 100644 --- a/mpich.spec +++ b/mpich.spec @@ -84,6 +84,7 @@ Source101: README.md Patch1: autogen-only-deal-with-json-yaksa-if-enabled.patch Patch2: autoconf-pull-dynamic-and-not-static-libs-from-pkg-config.patch Patch3: romio-test-fix-bad-snprintf-arguments.patch +Patch4: ch4-shm-fix-data-type-for-recv_bytes-in-MPIDI_POSIX_mpi_release_gather_release.patch BuildRoot: %{_tmppath}/%{name}-%{version}-build BuildRequires: fdupes -- 2.51.1 From 0064fc5d6c4f23ad225a353eaf255a602d146f879e38c2c017c5459b477fad2d Mon Sep 17 00:00:00 2001 From: Nicolas Morey Date: Sun, 9 Nov 2025 00:20:19 +0100 Subject: [PATCH 6/8] Speed up testsuite by not rebuilding mpich completely Signed-off-by: Nicolas Morey --- mpich.changes | 1 + mpich.spec | 27 ++++++++++++++------------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/mpich.changes b/mpich.changes index e6c3100..3c191fb 100644 --- a/mpich.changes +++ b/mpich.changes @@ 
-5,6 +5,7 @@ Tue Nov 4 17:54:18 UTC 2025 - Nicolas Morey - Rename standard multibuild flavor to ucx for more clarity - Add ch4-shm-fix-data-type-for-recv_bytes-in-MPIDI_POSIX_mpi_release_gather_release.patch to fix a datatype issue on s390x +- Speed up testsuite by not rebuilding mpich completely ------------------------------------------------------------------- Tue Oct 28 17:54:04 UTC 2025 - Nicolas Morey diff --git a/mpich.spec b/mpich.spec index 64d41f1..e0e7509 100644 --- a/mpich.spec +++ b/mpich.spec @@ -220,6 +220,20 @@ autoreconf -fi MPICHLIB_CFLAGS="%{optflags}" \ MPICHLIB_CXXFLAGS="%{optflags}" +%if 0%{?testsuite} +%install +rm -rf %{buildroot}/* + +%check +# Disable CMA. Modern kernels require specific ptrace capabilities +# that are not available in OBS +export MPIR_CVAR_CH4_CMA_ENABLE=0 +for dir in src/mpl src/mpi/romio/test; do +( + cd $dir && make check +) + +%else make %{?_smp_mflags} VERBOSE=1 %install @@ -253,19 +267,6 @@ find %{buildroot} -name "*.a" -delete %fdupes %{buildroot}%{p_datadir} %fdupes %{buildroot}%{p_libdir}/pkgconfig -%if 0%{?testsuite} -# Remove everything from testsuite package -# It is all contained by mpich packages -rm -rf %{buildroot}/* - -%check -# Disable CMA. 
Modern kernels require specific ptrace capabilities -# that are not available in OBS -export MPIR_CVAR_CH4_CMA_ENABLE=0 -make check - -%else - # make and install mpivars files install -m 0755 -d %{buildroot}%{_bindir} sed -e 's,prefix,%p_prefix,g' -e 's,libdir,%{p_libdir},g' %{S:1} > %{buildroot}%{p_bindir}/mpivars.sh -- 2.51.1 From 83fcab613eeeb4dd184b7f899851134de8fc3638f34d74faeba9507c85a65615 Mon Sep 17 00:00:00 2001 From: Nicolas Morey Date: Sun, 9 Nov 2025 00:30:48 +0100 Subject: [PATCH 7/8] Fix random configure failure on aarch64 Signed-off-by: Nicolas Morey --- mpich.changes | 1 + mpich.spec | 13 +++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/mpich.changes b/mpich.changes index 3c191fb..dc76bb0 100644 --- a/mpich.changes +++ b/mpich.changes @@ -6,6 +6,7 @@ Tue Nov 4 17:54:18 UTC 2025 - Nicolas Morey - Add ch4-shm-fix-data-type-for-recv_bytes-in-MPIDI_POSIX_mpi_release_gather_release.patch to fix a datatype issue on s390x - Speed up testsuite by not rebuilding mpich completely +- Fix random configure failure on aarch64 ------------------------------------------------------------------- Tue Oct 28 17:54:04 UTC 2025 - Nicolas Morey diff --git a/mpich.spec b/mpich.spec index e0e7509..4ddbd17 100644 --- a/mpich.spec +++ b/mpich.spec @@ -191,7 +191,11 @@ rm -R modules/{ucx,libfabric,json-c} export FFLAGS="-fallow-argument-mismatch $FFLAGS" export FCFLAGS="-fallow-argument-mismatch $FCFLAGS" %endif - +%ifarch aarch64 +# For some reason, configure has random issue with defining this +# on aarch64 only. 
Set it to avoid random failure +export CROSS_F77_SIZEOF_INTEGER=4 +%endif autoreconf -fi %configure \ --prefix=%{p_prefix} \ @@ -229,9 +233,10 @@ rm -rf %{buildroot}/* # that are not available in OBS export MPIR_CVAR_CH4_CMA_ENABLE=0 for dir in src/mpl src/mpi/romio/test; do -( - cd $dir && make check -) + ( + cd $dir && make check + ) +done %else make %{?_smp_mflags} VERBOSE=1 -- 2.51.1 From 2f71a10b621be952686d67abd3706a36299b3cdd3b9f36f83f34fd5bedd66a2b Mon Sep 17 00:00:00 2001 From: Nicolas Morey Date: Sun, 9 Nov 2025 00:53:40 +0100 Subject: [PATCH 8/8] Fix FFLAGS Signed-off-by: Nicolas Morey --- mpich.changes | 1 + mpich.spec | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/mpich.changes b/mpich.changes index dc76bb0..d15cfe4 100644 --- a/mpich.changes +++ b/mpich.changes @@ -7,6 +7,7 @@ Tue Nov 4 17:54:18 UTC 2025 - Nicolas Morey to fix a datatype issue on s390x - Speed up testsuite by not rebuilding mpich completely - Fix random configure failure on aarch64 +- Fix FFLAGS ------------------------------------------------------------------- Tue Oct 28 17:54:04 UTC 2025 - Nicolas Morey diff --git a/mpich.spec b/mpich.spec index 4ddbd17..6752049 100644 --- a/mpich.spec +++ b/mpich.spec @@ -188,8 +188,8 @@ rm -R modules/{ucx,libfabric,json-c} # GCC10 needs an extra flag to allow badly passed parameters %if 0%{?suse_version} > 1500 -export FFLAGS="-fallow-argument-mismatch $FFLAGS" -export FCFLAGS="-fallow-argument-mismatch $FCFLAGS" +export FFLAGS="-fallow-argument-mismatch %{optflags}" +export FCFLAGS="-fallow-argument-mismatch %{optflags}" %endif %ifarch aarch64 # For some reason, configure has random issue with defining this -- 2.51.1