SHA256
6
0
forked from pool/mpich

Fixes #4

Manually merged
HPC merged 8 commits from NMorey/mpich:main into main 2025-11-09 02:18:11 +01:00
4 changed files with 88 additions and 19 deletions

View File

@@ -1,6 +1,6 @@
<multibuild>
<package>standard</package>
<package>testsuite</package>
<package>ucx</package>
<package>ucx-testsuite</package>
<package>ofi</package>
<package>ofi-testsuite</package>
</multibuild>

View File

@@ -0,0 +1,47 @@
commit 564c0affae6a79b3c99fce5717c402182d74daa1
Author: Nicolas Morey <nmorey@suse.com>
Date: Sat Nov 8 23:34:58 2025 +0100
ch4: shm: fix data type for recv_bytes in MPIDI_POSIX_mpi_release_gather_release
The number of received bytes in release_gather_release is incorrectly converted between
int and MPI_Aint. On most architectures this is not an issue, but on big-endian 64-bit architectures (s390x)
the actual value is lost because only the 4 most significant bytes are copied.
Fix the issue by writing the whole MPI_Aint into the shm_buf instead of just an int.
Signed-off-by: Nicolas Morey <nmorey@suse.com>
diff --git src/mpid/ch4/shm/posix/release_gather/release_gather.h src/mpid/ch4/shm/posix/release_gather/release_gather.h
index ac966cb9772e..ff1308830d00 100644
--- src/mpid/ch4/shm/posix/release_gather/release_gather.h
+++ src/mpid/ch4/shm/posix/release_gather/release_gather.h
@@ -121,7 +121,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_mpi_release_gather_release(void *local_
datatype, root, MPIR_BCAST_TAG, comm_ptr, &status);
MPIR_ERR_CHECK(mpi_errno);
MPIR_Get_count_impl(&status, MPI_BYTE, &recv_bytes);
- MPIR_Typerep_copy(bcast_data_addr, &recv_bytes, sizeof(int),
+ MPIR_Typerep_copy(bcast_data_addr, &recv_bytes, sizeof(MPI_Aint),
MPIR_TYPEREP_FLAG_NONE);
/* It is necessary to copy the errflag as well to handle the case when non-root
* becomes temporary root as part of compositions (or smp aware colls). These temp
@@ -146,7 +146,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_mpi_release_gather_release(void *local_
/* When error checking is enabled, place the datasize in shm_buf first, followed by the
* errflag, followed by the actual data with an offset of (2*cacheline_size) bytes from
* the starting address */
- MPIR_Typerep_copy(bcast_data_addr, &count, sizeof(int), MPIR_TYPEREP_FLAG_NONE);
+ MPIR_Typerep_copy(bcast_data_addr, &count, sizeof(MPI_Aint), MPIR_TYPEREP_FLAG_NONE);
/* It is necessary to copy the errflag as well to handle the case when non-root
* becomes root as part of compositions (or smp aware colls). These roots might
* expect same data as other ranks but different from the actual root. So only
@@ -218,8 +218,9 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_mpi_release_gather_release(void *local_
* datasize is copied out from shm_buffer and compared against the count a rank was
* expecting. Also, the errflag is copied out. In case of mismatch mpi_errno is set.
* Actual data starts after (2*cacheline_size) bytes */
- int recv_bytes, recv_errflag;
- MPIR_Typerep_copy(&recv_bytes, bcast_data_addr, sizeof(int), MPIR_TYPEREP_FLAG_NONE);
+ MPI_Aint recv_bytes;
+ int recv_errflag;
+ MPIR_Typerep_copy(&recv_bytes, bcast_data_addr, sizeof(MPI_Aint), MPIR_TYPEREP_FLAG_NONE);
MPIR_Typerep_copy(&recv_errflag, (char *) bcast_data_addr + MPIDU_SHM_CACHE_LINE_LEN,
sizeof(int), MPIR_TYPEREP_FLAG_NONE);
MPIR_ERR_CHKANDJUMP2(recv_bytes != count, mpi_errno, MPI_ERR_OTHER,

View File

@@ -1,3 +1,14 @@
-------------------------------------------------------------------
Tue Nov 4 17:54:18 UTC 2025 - Nicolas Morey <nicolas.morey@suse.com>
- Use external hwloc
- Rename standard multibuild flavor to ucx for more clarity
- Add ch4-shm-fix-data-type-for-recv_bytes-in-MPIDI_POSIX_mpi_release_gather_release.patch
to fix a datatype issue on s390x
- Speed up testsuite by not rebuilding mpich completely
- Fix random configure failure on aarch64
- Fix FFLAGS
-------------------------------------------------------------------
Tue Oct 28 17:54:04 UTC 2025 - Nicolas Morey <nicolas.morey@suse.com>

View File

@@ -29,10 +29,10 @@
ExclusiveArch: do_not_build
%endif
%if "%{flavor}" == "standard"
%if "%{flavor}" == "ucx"
%define build_flavor ucx
%endif
%if "%{flavor}" == "testsuite"
%if "%{flavor}" == "ucx-testsuite"
%define build_flavor ucx
%define testsuite 1
%endif
@@ -84,6 +84,7 @@ Source101: README.md
Patch1: autogen-only-deal-with-json-yaksa-if-enabled.patch
Patch2: autoconf-pull-dynamic-and-not-static-libs-from-pkg-config.patch
Patch3: romio-test-fix-bad-snprintf-arguments.patch
Patch4: ch4-shm-fix-data-type-for-recv_bytes-in-MPIDI_POSIX_mpi_release_gather_release.patch
BuildRoot: %{_tmppath}/%{name}-%{version}-build
BuildRequires: fdupes
@@ -101,6 +102,7 @@ BuildRequires: libtool
BuildRequires: libtool
BuildRequires: mpi-selector
BuildRequires: python3-devel
BuildRequires: hwloc-devel >= 2.0
%if "%{build_flavor}" == "ofi"
BuildRequires: libfabric-devel
@@ -114,7 +116,6 @@ BuildRequires: libuct-devel >= 1.7.0
# UCX is only available for 64b archs
ExcludeArch: %ix86 %arm
%endif
Provides: mpi
BuildRequires: Modules
BuildRequires: gcc-c++
@@ -187,11 +188,15 @@ rm -R modules/{ucx,libfabric,json-c}
# GCC 10 needs an extra flag to accept mismatched argument types in Fortran calls
%if 0%{?suse_version} > 1500
export FFLAGS="-fallow-argument-mismatch $FFLAGS"
export FCFLAGS="-fallow-argument-mismatch $FCFLAGS"
export FFLAGS="-fallow-argument-mismatch %{optflags}"
export FCFLAGS="-fallow-argument-mismatch %{optflags}"
%endif
./autogen.sh --without-ucx --without-ofi --without-json
%ifarch aarch64
# For some reason, configure randomly has issues defining this,
# on aarch64 only. Set it explicitly to avoid random failures.
export CROSS_F77_SIZEOF_INTEGER=4
%endif
autoreconf -fi
%configure \
--prefix=%{p_prefix} \
--exec-prefix=%{p_prefix} \
@@ -205,6 +210,7 @@ export FCFLAGS="-fallow-argument-mismatch $FCFLAGS"
--sysconfdir=%{p_sysconfdir} \
--disable-rpath \
--disable-wrapper-rpath \
--with-hwloc \
%if "%{build_flavor}" == "ofi"
--with-ofi \
--with-device=ch4:ofi \
@@ -218,6 +224,21 @@ export FCFLAGS="-fallow-argument-mismatch $FCFLAGS"
MPICHLIB_CFLAGS="%{optflags}" \
MPICHLIB_CXXFLAGS="%{optflags}"
%if 0%{?testsuite}
%install
rm -rf %{buildroot}/*
%check
# Disable CMA. Modern kernels require specific ptrace capabilities
# that are not available in OBS
export MPIR_CVAR_CH4_CMA_ENABLE=0
for dir in src/mpl src/mpi/romio/test; do
(
cd $dir && make check
)
done
%else
make %{?_smp_mflags} VERBOSE=1
%install
@@ -251,16 +272,6 @@ find %{buildroot} -name "*.a" -delete
%fdupes %{buildroot}%{p_datadir}
%fdupes %{buildroot}%{p_libdir}/pkgconfig
%if 0%{?testsuite}
# Remove everything from testsuite package
# It is all contained by mpich packages
rm -rf %{buildroot}/*
%check
make check
%else
# make and install mpivars files
install -m 0755 -d %{buildroot}%{_bindir}
sed -e 's,prefix,%p_prefix,g' -e 's,libdir,%{p_libdir},g' %{S:1} > %{buildroot}%{p_bindir}/mpivars.sh