diff --git a/UCS-TIME-Add-math.h-to-provide-INFINITY.patch b/UCS-TIME-Add-math.h-to-provide-INFINITY.patch deleted file mode 100644 index f9591dc..0000000 --- a/UCS-TIME-Add-math.h-to-provide-INFINITY.patch +++ /dev/null @@ -1,18 +0,0 @@ -commit c49bd7a5d183a57f41c801c7f5c9691bcd7d23da -Author: Thomas Vegas -Date: Mon Jun 24 16:52:04 2024 +0300 - - UCS/TIME: Add math.h to provide INFINITY - -diff --git src/ucs/time/time.h src/ucs/time/time.h -index cff9810cdad8..c51362273f8d 100644 ---- src/ucs/time/time.h -+++ src/ucs/time/time.h -@@ -11,6 +11,7 @@ - #include - #include - #include -+#include - - BEGIN_C_DECLS - diff --git a/openucx-s390x-support.patch b/openucx-s390x-support.patch index b57c0bb..c950f7b 100644 --- a/openucx-s390x-support.patch +++ b/openucx-s390x-support.patch @@ -1,6 +1,6 @@ -commit 70e243c8a6685a03d5faa65e706d318196ad712b +commit ba1d7048df80ee535e01335992f70568e2f88c80 Author: Nicolas Morey -Date: Wed Jun 26 17:36:58 2024 +0200 +Date: Wed Feb 19 16:46:33 2025 +0100 openucx s390x support @@ -33,10 +33,10 @@ index e5e66266d695..ef7e4ede93ce 100644 AS_IF([test "x$bistro_hooks_happy" = "xyes"], [AC_DEFINE([UCM_BISTRO_HOOKS], [1], [Enable BISTRO hooks])], diff --git src/ucm/Makefile.am src/ucm/Makefile.am -index fa7a722f2d31..e6df414a4ecb 100644 +index 7866aa0ac13b..2d44e20f124d 100644 --- src/ucm/Makefile.am +++ src/ucm/Makefile.am -@@ -34,6 +34,7 @@ noinst_HEADERS = \ +@@ -35,6 +35,7 @@ noinst_HEADERS = \ bistro/bistro_aarch64.h \ bistro/bistro_ppc64.h \ bistro/bistro_rv64.h @@ -45,7 +45,7 @@ index fa7a722f2d31..e6df414a4ecb 100644 libucm_la_SOURCES = \ event/event.c \ diff --git src/ucm/bistro/bistro.h src/ucm/bistro/bistro.h -index 8d0b90751676..a0b9d3f064c3 100644 +index fffbe738b116..31859a84b159 100644 --- src/ucm/bistro/bistro.h +++ src/ucm/bistro/bistro.h @@ -23,6 +23,8 @@ typedef struct ucm_bistro_restore_point ucm_bistro_restore_point_t; @@ -91,7 +91,7 @@ index 000000000000..2beb5de54fab + +#endif diff --git src/ucs/Makefile.am src/ucs/Makefile.am -index 4a05f47b6369..c1cd2fb2cb57 100644 +index 86a469a60bcc..6751bad764b8 100644 --- src/ucs/Makefile.am +++ src/ucs/Makefile.am @@ -24,6 +24,7 @@ nobase_dist_libucs_la_HEADERS = \ @@ -140,7 +140,7 @@ index 849647902fab..a328c37e2020 100644 # error "Unsupported architecture" #endif diff --git src/ucs/arch/bitops.h src/ucs/arch/bitops.h -index 3e0e530f1336..f887e03ebac0 100644 +index f8e51c45888a..476631d95eb6 100644 --- src/ucs/arch/bitops.h +++ src/ucs/arch/bitops.h @@ -23,6 +23,8 @@ BEGIN_C_DECLS @@ -153,7 +153,7 @@ index 3e0e530f1336..f887e03ebac0 100644 # error "Unsupported architecture" #endif diff --git src/ucs/arch/cpu.c src/ucs/arch/cpu.c -index 307fb61bfc4a..4356fff36f8b 100644 +index 6fe5e31dba31..f92c53f303cd 100644 --- src/ucs/arch/cpu.c +++ src/ucs/arch/cpu.c @@ -64,6 +64,10 @@ const ucs_cpu_builtin_memcpy_t ucs_cpu_builtin_memcpy[UCS_CPU_VENDOR_LAST] = { @@ -167,15 +167,15 @@ index 307fb61bfc4a..4356fff36f8b 100644 [UCS_CPU_VENDOR_FUJITSU_ARM] = { .min = UCS_MEMUNITS_INF, .max = UCS_MEMUNITS_INF -@@ -89,6 +93,7 @@ const size_t ucs_cpu_est_bcopy_bw[UCS_CPU_VENDOR_LAST] = { - [UCS_CPU_VENDOR_GENERIC_ARM] = UCS_CPU_EST_BCOPY_BW_DEFAULT, - [UCS_CPU_VENDOR_GENERIC_PPC] = UCS_CPU_EST_BCOPY_BW_DEFAULT, - [UCS_CPU_VENDOR_GENERIC_RV64G] = UCS_CPU_EST_BCOPY_BW_DEFAULT, -+ [UCS_CPU_VENDOR_GENERIC_IBM] = UCS_CPU_EST_BCOPY_BW_DEFAULT, - [UCS_CPU_VENDOR_FUJITSU_ARM] = UCS_CPU_EST_BCOPY_BW_FUJITSU_ARM, - [UCS_CPU_VENDOR_ZHAOXIN] = UCS_CPU_EST_BCOPY_BW_DEFAULT, - [UCS_CPU_VENDOR_NVIDIA] = UCS_CPU_EST_BCOPY_BW_DEFAULT -@@ -183,6 +188,7 @@ const char *ucs_cpu_vendor_name() +@@ -82,7 +86,6 @@ const ucs_cpu_builtin_memcpy_t ucs_cpu_builtin_memcpy[UCS_CPU_VENDOR_LAST] = { + } + }; + +- + static void ucs_sysfs_get_cache_size() + { + char type_str[32]; /* Data/Instruction/Unified */ +@@ -167,6 +170,7 @@ const char *ucs_cpu_vendor_name() [UCS_CPU_VENDOR_GENERIC_ARM] = "Generic ARM", [UCS_CPU_VENDOR_GENERIC_PPC] = "Generic PPC", [UCS_CPU_VENDOR_GENERIC_RV64G] = "Generic RV64G", @@ -183,7 +183,7 @@ index 307fb61bfc4a..4356fff36f8b 100644 [UCS_CPU_VENDOR_FUJITSU_ARM] = "Fujitsu ARM", [UCS_CPU_VENDOR_ZHAOXIN] = "Zhaoxin", [UCS_CPU_VENDOR_NVIDIA] = "Nvidia" -@@ -212,6 +218,7 @@ const char *ucs_cpu_model_name() +@@ -197,6 +201,7 @@ const char *ucs_cpu_model_name() [UCS_CPU_MODEL_ZHAOXIN_WUDAOKOU] = "Wudaokou", [UCS_CPU_MODEL_ZHAOXIN_LUJIAZUI] = "Lujiazui", [UCS_CPU_MODEL_RV64G] = "RV64G", @@ -192,10 +192,10 @@ index 307fb61bfc4a..4356fff36f8b 100644 }; diff --git src/ucs/arch/cpu.h src/ucs/arch/cpu.h -index ca25e714d141..e97405c30d52 100644 +index 857b8b804cf7..89461d52d406 100644 --- src/ucs/arch/cpu.h +++ src/ucs/arch/cpu.h -@@ -39,6 +39,7 @@ typedef enum ucs_cpu_model { +@@ -41,6 +41,7 @@ typedef enum ucs_cpu_model { UCS_CPU_MODEL_ZHAOXIN_WUDAOKOU, UCS_CPU_MODEL_ZHAOXIN_LUJIAZUI, UCS_CPU_MODEL_RV64G, @@ -203,7 +203,7 @@ index ca25e714d141..e97405c30d52 100644 UCS_CPU_MODEL_NVIDIA_GRACE, UCS_CPU_MODEL_LAST } ucs_cpu_model_t; -@@ -68,6 +69,7 @@ typedef enum ucs_cpu_vendor { +@@ -70,6 +71,7 @@ typedef enum ucs_cpu_vendor { UCS_CPU_VENDOR_AMD, UCS_CPU_VENDOR_GENERIC_ARM, UCS_CPU_VENDOR_GENERIC_PPC, @@ -211,7 +211,7 @@ index ca25e714d141..e97405c30d52 100644 UCS_CPU_VENDOR_FUJITSU_ARM, UCS_CPU_VENDOR_ZHAOXIN, UCS_CPU_VENDOR_GENERIC_RV64G, -@@ -107,6 +109,8 @@ typedef struct ucs_cpu_builtin_memcpy { +@@ -109,6 +111,8 @@ typedef struct ucs_cpu_builtin_memcpy { # include "aarch64/cpu.h" #elif defined(__riscv) # include "rv64/cpu.h" @@ -278,10 +278,10 @@ index 000000000000..ce48ff1ff451 +#endif diff --git src/ucs/arch/s390x/cpu.h src/ucs/arch/s390x/cpu.h new file mode 100644 -index 000000000000..0aee278010d2 +index 000000000000..e1d41a0ef8b8 --- /dev/null +++ src/ucs/arch/s390x/cpu.h -@@ -0,0 +1,84 @@ +@@ -0,0 +1,86 @@ +/** +* Copyright (C) Mellanox Technologies Ltd. 2001-2013. ALL RIGHTS RESERVED. +* Copyright (C) ARM Ltd. 2016-2017. ALL RIGHTS RESERVED. @@ -347,7 +347,9 @@ index 000000000000..0aee278010d2 +{ +} + -+static inline void *ucs_memcpy_relaxed(void *dst, const void *src, size_t len) ++static inline void *ucs_memcpy_relaxed(void *dst, const void *src, size_t len, ++ ucs_arch_memcpy_hint_t hint, ++ size_t total_len) +{ + return memcpy(dst, src, len); +} @@ -428,7 +430,7 @@ index 000000000000..225e4e5e896a +#endif + diff --git src/ucs/sys/sys.c src/ucs/sys/sys.c -index 42ff75f64af5..b22418e3f4b0 100644 +index d0b5effe11a3..ce22a2097f18 100644 --- src/ucs/sys/sys.c +++ src/ucs/sys/sys.c @@ -1258,8 +1258,19 @@ void *ucs_sys_realloc(void *old_ptr, size_t old_length, size_t new_length) diff --git a/openucx.changes b/openucx.changes index d07004b..60bdb9c 100644 --- a/openucx.changes +++ b/openucx.changes @@ -1,3 +1,128 @@ +------------------------------------------------------------------- +Wed Feb 19 15:47:23 UTC 2025 - Nicolas Morey + +- Update to ucx 1.18.0 + - UCP + - Enabled using CUDA staging buffers for pipeline protocols by default + - Added endpoint reconfiguration support for non-reused p2p scenarios + - Enabled non-cacheable memory domains, activated for gdr_copy + - Added user_data parameter to ucp_ep_query + - Added support for host memory pipeline through CUDA buffers for rendezvous protocol + - Added global VA infrastructure and memory region in absence of error handling + - Made protocol performance node names more informative + - Enforced always running on the same thread in single thread mode + - Multiple improvements in protocols selection infrastructure + - Added UCP_MEM_MAP_LOCK API flag to enforce locked memory mapping + - Allowed up-to 64 endpoint lanes for systems with many transports or devices + - Added usage tracker to worker + - Improved various logging messages + - Fixed stack overflow in exported rkey unpack + - Removed extra remote-cpu overhead from protocol estimation for zcopy + - Fixed performance estimation for rndv pipeline protocols + - Fixed ATP sending by picking the correct lane + - Fixed missing reg_id on memh creation + - Fixed repeated invalidations by retaining existing access flags + - Fixed abort reason propagation for rendezvous RTR mtype + - Do not check transport availability if it is disabled by UCX_TLS environment variable + - Fixed wrong flag being used for checking BCOPY capability + - Fixed sending too many ATPs for small messages + - Enforced 16 bits size for Active Messages identifiers + - Fixed unnecessary status check for emulated AMO + - Fixed more than one fragment sending in rendezvous pipeline + - Fixed crash by using biggest max frag across all lanes + - Fixed missing memory handle flags by copying from parent to child + - Fixed worker interface activate count + - Fixed flush requests by replacing ATP/flush lane map with lane indexes + - Fixed lost uct_flags when merging memory regions + - UCT + - Fixed memory domain UCT flags description + - RDMA CORE (IB, ROCE, etc.) + - Added environment variable to manage DC initiator capacity + - Added DC dcs_hybrid policy + - Reduced MLX5/DV stack size consumption + - Added ODP support for verbs and mlx5dv + - Added support of CUDA managed memory on IB when ODP is available + - Added support of Adaptive Routing on RoCE + - Enabled use of implicit ODP with relaxed ordering + - Improved GPU-Direct detection in IB transport + - Increased DC initiator default count to 32 for performance optimization + - Added ConnectX-8 device support with DDP + - Added support for subnet filter list for RoCE interfaces + - Enhanced the error message to provide more details when a connection cannot be + established due to unreachable transports + - Added IB MLX5 as a separate UCX module with separate RPM sub-package + - Added initial support for GGA transport, for fast DPU memory access + - Set IB DevX atomic mode based on device capabilities + - Removed DC keepalive mechanism, since the keepalive is done on UCP layer + - Optimized cross-gVMI memory registration using indirect memory keys cache + - Improved various logging messages + - Fixed FETCH_ADD remote access error for ODP/KSM case + - Fixed missing conditional compilation checks for DM + - Fixed IB MD allocation naming typo + - Fixed invalid GIDs filter in IB + - Fixed flags usage in MLX5 zcopy_post + - Do not limit ODP registration retries + - Fixed JUCX failures by considering the number of supported completion vectors + - UCS + - Added support for wildcards in configuration parameter names + - Added ASAN protection to several internal data structures + - Reduced stack usage in topology detection code + - Improved bitmaps configuration parsing with wider bitfield + - Added options to set topology distance between devices + - Optimized VFS unix socket watch by using user private folder + - Added general IP subnet matching infrastructure + - Extend array data structure to support user-provided array copy routine + - Improved time units description + - Fixed a crash by using heap allocation to process expired timers in batch + - Fixed allocation issue on memtrack dump + - Fixed deletion of the monitored folder in VFS + - Fixed unsafe resize for DC initiator array + - Fixed function macro invocation to match C standard + - Fixed calling async handler on already released resource + - Fixed performance by setting higher bandwidth for different NUMA nodes on Grace + - Fixed undeclared value error in timer conversion routine + - Fixed uninitialized value access in registration cache + - UCM + - Extend CUDA memory hooks to include memory mapping APIs + - Fixed race condition in parsing proc maps + - Fixed mremap failure while parsing /proc/self/maps + - TCP + - Always bind endpoint to interface + - Tools + - Improved performance by increasing window size for put_bw and add get_bw in ucx_perftest + - Added multi-send flag for receive operations in bandwidth benchmarks in ucx_perftest + - Improved ucx_perftest uni-directional test with added fence + - Detailed ucx_perftest batch section of command-line documentation + - Fixed buffer size potential overflow in ucx_perftest + - Fixed missing address when packing memory keys on ucx_perftest + - Fixed memory leak for endpoint report in ucx_info + - Fixed build without openmp in ucx_perftest + - Fixed UCT device override on server side on ucx_perftest + - Documentation + - Added a section regarding adaptive routing on RoCE + - Architecture + - Added CPU Model for MI300A + - Added Fujitsu ARM specific values to ucx.conf + - Added AMD Turin support + - Added an optimized non-temporal memory copy implementation for AMD CPU + - Build + - Improved compiler error reporting with added flag + - Improved coverity script to allow faster turnaround time + - Improved Intel Compiler detection and support + - Fixed using correct ASAN version for running tests + - Configuration + - Used POSIX bourne syntax to check equality + - Fixed build failure by using proper flags in compiler.m4 + - Fixed perftest MAD support default guessing + - GO + - Added multi-send flag and user memh support in request params + - Added serialized thread mode to avoid subtle races between threads + - Fixed make distcheck + - Packaging + - Improved dpkg-buildpackage sample command by explicitly adding mlx5 related arguments +- Delete UCS-TIME-Add-math.h-to-provide-INFINITY.patch which was merged upstream +- Refresh openucx-s390x-support.patch due to API changes + ------------------------------------------------------------------- Sat Sep 7 14:22:20 UTC 2024 - Nicolas Morey diff --git a/openucx.spec b/openucx.spec index 70bd83f..d03d831 100644 --- a/openucx.spec +++ b/openucx.spec @@ -1,7 +1,7 @@ # # spec file for package openucx # -# Copyright (c) 2024 SUSE LLC +# Copyright (c) 2025 SUSE LLC # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -20,7 +20,7 @@ %define version_suf %{nil} Name: openucx -Version: 1.17.0 +Version: 1.18.0 Release: 0 Summary: Communication layer for Message Passing (MPI) License: BSD-3-Clause @@ -32,7 +32,6 @@ URL: http://openucx.org/ Source: https://github.com/openucx/ucx/releases/download/v%version%{?version_suf}/ucx-%version.tar.gz Patch1: openucx-s390x-support.patch Patch2: ucm-fix-UCX_MEM_MALLOC_RELOC.patch -Patch3: UCS-TIME-Add-math.h-to-provide-INFINITY.patch BuildRequires: autoconf >= 2.63 BuildRequires: automake >= 1.10 BuildRequires: binutils-devel @@ -158,7 +157,8 @@ export UCX_CFLAGS="$UCX_CFLAGS -mno-sse -mno-sse2" --disable-debug --disable-assertions \ --disable-params-check \ --with-rc --with-ud --with-dc \ - --with-mlx5-dv --with-rdmacm + --with-ib-hw-tm --with-dm --with-devx \ + --with-mlx5 --with-rdmacm # Override BASE_CFLAGS to disable Werror (boo#1121267) make %{?_smp_mflags} V=1 BASE_CFLAGS="-g -Wall" @@ -230,6 +230,7 @@ mv %buildroot/%_bindir/io_demo %buildroot/%_libexecdir/%{name}/ %_libdir/libuct.so.* %dir %_libdir/ucx/ %_libdir/ucx/libuct_*.so.* +%_libdir/ucx/libucx_perftest_mad.so.* %files -n libuct-devel %defattr(-,root,root) @@ -237,9 +238,11 @@ mv %buildroot/%_bindir/io_demo %buildroot/%_libexecdir/%{name}/ %_libdir/libuct.so %dir %_libdir/ucx/ %_libdir/ucx/libuct_*.so +%_libdir/ucx/libucx_perftest_mad.so %_libdir/pkgconfig/ucx-uct.pc %_libdir/pkgconfig/ucx-cma.pc %_libdir/pkgconfig/ucx-ib.pc +%_libdir/pkgconfig/ucx-ib-mlx5.pc %_libdir/pkgconfig/ucx-rdmacm.pc %changelog diff --git a/ucx-1.17.0.tar.gz b/ucx-1.17.0.tar.gz deleted file mode 100644 index 41862a4..0000000 --- a/ucx-1.17.0.tar.gz +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:34658e282f99f89ce7a991c542e9727552734ac6ad408c52f22b4c2653b04276 -size 3249625 diff --git a/ucx-1.18.0.tar.gz b/ucx-1.18.0.tar.gz new file mode 100644 index 0000000..abd0c93 --- /dev/null +++ b/ucx-1.18.0.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa75070f5fa7442731b4ef5fc9549391e147ed3d859afeb1dad2d4513b39dc33 +size 3307355