diff --git a/0001-tensile-gfx1153.patch b/0001-tensile-gfx1153.patch new file mode 100644 index 0000000..0f92bfb --- /dev/null +++ b/0001-tensile-gfx1153.patch @@ -0,0 +1,174 @@ +From 6d6d64c3065b3485ff764c4e368849ba0d41d0e4 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Sun, 27 Jul 2025 13:06:25 -0700 +Subject: [PATCH] tensile gfx1153 + +cut-n-paste from gfx1152 + +Signed-off-by: Tom Rix +--- + Tensile/AsmCaps.py | 44 +++++++++++++++++++ + Tensile/Common.py | 6 +-- + Tensile/Source/CMakeLists.txt | 4 +- + Tensile/Source/lib/include/Tensile/AMDGPU.hpp | 7 +++ + .../include/Tensile/PlaceholderLibrary.hpp | 3 ++ + 5 files changed, 59 insertions(+), 5 deletions(-) + +diff --git a/Tensile/AsmCaps.py b/Tensile/AsmCaps.py +index ea0518752ac4..83514e6d8f03 100644 +--- a/Tensile/AsmCaps.py ++++ b/Tensile/AsmCaps.py +@@ -1035,6 +1035,50 @@ CACHED_ASM_CAPS = \ + 'v_mov_b64': False, + 'v_pk_fma_f16': True, + 'v_pk_fmac_f16': False}, ++ (11, 5, 3): {'HasAddLshl': True, ++ 'HasAtomicAdd': True, ++ 'HasDirectToLdsDest': False, ++ 'HasDirectToLdsNoDest': False, ++ 'HasExplicitCO': True, ++ 'HasExplicitNC': True, ++ 'HasGLCModifier': True, ++ 'HasNTModifier': False, ++ 'HasLshlOr': True, ++ 'HasMFMA': False, ++ 'HasMFMA_b8': False, ++ 'HasMFMA_bf16_1k': False, ++ 'HasMFMA_bf16_original': False, ++ 'HasMFMA_constSrc': False, ++ 'HasMFMA_f64': False, ++ 'HasMFMA_f8': False, ++ 'HasMFMA_i8_908': False, ++ 'HasMFMA_i8_940': False, ++ 'HasMFMA_vgpr': False, ++ 'HasMFMA_xf32': False, ++ 'HasSMulHi': True, ++ 'HasWMMA': True, ++ 'KernargPreloading': False, ++ 'MaxLgkmcnt': 15, ++ 'MaxVmcnt': 63, ++ 'SupportedISA': True, ++ 'SupportedSource': True, ++ 'VOP3v_dot4_i32_i8': False, ++ 'v_dot2_f32_f16': True, ++ 'v_dot2c_f32_f16': True, ++ 'v_dot4_i32_i8': False, ++ 'v_dot4c_i32_i8': False, ++ 'v_fma_f16': True, ++ 'v_fma_f32': True, ++ 'v_fma_f64': True, ++ 'v_fma_mix_f32': True, ++ 'v_fmac_f16': False, ++ 'v_fmac_f32': True, ++ 'v_mac_f16': False, ++ 'v_mac_f32': False, ++ 'v_mad_mix_f32': 
False, ++ 'v_mov_b64': False, ++ 'v_pk_fma_f16': True, ++ 'v_pk_fmac_f16': False}, + (12, 0, 0): {'HasAddLshl': True, + 'HasAtomicAdd': False, + 'HasDirectToLdsDest': False, +diff --git a/Tensile/Common.py b/Tensile/Common.py +index 107dcb272c61..f66e7a274953 100644 +--- a/Tensile/Common.py ++++ b/Tensile/Common.py +@@ -247,7 +247,7 @@ globalParameters["SupportedISA"] = [(8,0,3), + (9,4,0), (9,4,1), (9,4,2), (9,5,0), + (10,1,0), (10,1,1), (10,1,2), (10,3,0), (10,3,1), (10,3,5), + (11,0,0), (11,0,1), (11,0,2), (11,0,3), +- (11,5,0), (11,5,1), (11,5,2), ++ (11,5,0), (11,5,1), (11,5,2), (11,5,3), + (12,0,0), (12,0,1)] # assembly kernels writer supports these architectures + + globalParameters["KeepBuildTmp"] = True # Do not remove build artifacts during the build process or build_tmp after build completes +@@ -325,7 +325,7 @@ architectureMap = { + 'gfx1010':'navi10', 'gfx1011':'navi12', 'gfx1012':'navi14', + 'gfx1030':'navi21', 'gfx1031':'navi22', 'gfx1032':'navi23', 'gfx1034':'navi24', 'gfx1035':'rembrandt', + 'gfx1100':'navi31', 'gfx1101':'navi32', 'gfx1102':'navi33', +- 'gfx1103':'gfx1103', 'gfx1150':'gfx1150', 'gfx1151':'gfx1151', 'gfx1152':'gfx1152', ++ 'gfx1103':'gfx1103', 'gfx1150':'gfx1150', 'gfx1151':'gfx1151', 'gfx1152':'gfx1152', 'gfx1153':'gfx1153', + 'gfx1200':'gfx1200', + 'gfx1201':'gfx1201' + } +@@ -2467,7 +2467,7 @@ def assignGlobalParameters( config, capabilitiesCache: Optional[dict] = None ): + if os.name == "nt": + globalParameters["CurrentISA"] = (9,0,6) + printWarning("Failed to detect ISA so forcing (gfx906) on windows") +- isasWithDisabledHWMonitor = ((9,4,1), (9,4,2), (9,5,0), (11,0,0), (11,0,1), (11,0,2), (11,0,3), (11,5,0), (11,5,1), (11,5,2), (12,0,0), (12,0,1)) ++ isasWithDisabledHWMonitor = ((9,4,1), (9,4,2), (9,5,0), (11,0,0), (11,0,1), (11,0,2), (11,0,3), (11,5,0), (11,5,1), (11,5,2), (11,5,3), (12,0,0), (12,0,1)) + if globalParameters["CurrentISA"] in isasWithDisabledHWMonitor: + isaString = ', '.join(map(gfxName, 
isasWithDisabledHWMonitor)) + printWarning(f"HardwareMonitor currently disabled for {isaString}") +diff --git a/Tensile/Source/CMakeLists.txt b/Tensile/Source/CMakeLists.txt +index 7f10ee319518..1002b29bb981 100644 +--- a/Tensile/Source/CMakeLists.txt ++++ b/Tensile/Source/CMakeLists.txt +@@ -51,9 +51,9 @@ if(CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" OR CMAKE_CXX_COMPILER MATCHES ".*clang + endif() + + if(CMAKE_CXX_COMPILER STREQUAL "hipcc") +- set(TENSILE_GPU_ARCHS gfx803 gfx900 gfx906:xnack- gfx908:xnack- gfx90a:xnack- gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 CACHE STRING "GPU architectures") ++ set(TENSILE_GPU_ARCHS gfx803 gfx900 gfx906:xnack- gfx908:xnack- gfx90a:xnack- gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1153 CACHE STRING "GPU architectures") + else() +- set(TENSILE_GPU_ARCHS gfx803 gfx900 gfx906 gfx908 gfx90a gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 CACHE STRING "GPU architectures") ++ set(TENSILE_GPU_ARCHS gfx803 gfx900 gfx906 gfx908 gfx90a gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1153 CACHE STRING "GPU architectures") + endif() + + include(CMakeDependentOption) +diff --git a/Tensile/Source/lib/include/Tensile/AMDGPU.hpp b/Tensile/Source/lib/include/Tensile/AMDGPU.hpp +index e65a4831e082..5e0a43c0fdab 100644 +--- a/Tensile/Source/lib/include/Tensile/AMDGPU.hpp ++++ b/Tensile/Source/lib/include/Tensile/AMDGPU.hpp +@@ -79,6 +79,7 @@ namespace Tensile + gfx1150 = 1150, + gfx1151 = 1151, + gfx1152 = 1152, ++ gfx1153 = 1153, + }; + + static std::string toString(Processor p) +@@ -133,6 +134,8 @@ namespace Tensile + return "gfx1151"; + case AMDGPU::Processor::gfx1152: + return "gfx1152"; ++ case AMDGPU::Processor::gfx1153: ++ return "gfx1153"; + } 
+ return ""; + } +@@ -219,6 +222,10 @@ namespace Tensile + { + return AMDGPU::Processor::gfx1152; + } ++ else if(deviceString.find("gfx1153") != std::string::npos) ++ { ++ return AMDGPU::Processor::gfx1153; ++ } + else + { + return static_cast<AMDGPU::Processor>(0); +diff --git a/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp b/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp +index ba9719f77bb2..3d1d43e0c9d0 100644 +--- a/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp ++++ b/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp +@@ -63,6 +63,7 @@ namespace Tensile + gfx1151, + gfx1150, + gfx1152, ++ gfx1153, + All + }; + +@@ -121,6 +122,8 @@ namespace Tensile + return "TensileLibrary_*_gfx1151"; + case LazyLoadingInit::gfx1152: + return "TensileLibrary_*_gfx1152"; ++ case LazyLoadingInit::gfx1153: ++ return "TensileLibrary_*_gfx1153"; + case LazyLoadingInit::None: + return ""; + } +-- +2.50.1 + diff --git a/0001-tensile-gfx950.patch b/0001-tensile-gfx950.patch new file mode 100644 index 0000000..235af77 --- /dev/null +++ b/0001-tensile-gfx950.patch @@ -0,0 +1,180 @@ +From e6851f038000be90cd29f3d530834e35111351c3 Mon Sep 17 00:00:00 2001 +From: Tom Rix +Date: Sun, 27 Jul 2025 12:20:36 -0700 +Subject: [PATCH] tensile gfx950 + +Copy gfx950 from the develop branch at commit +01ab9e776518ff8fda3a0086a3f3f9d17cd95f59 + +Signed-off-by: Tom Rix +--- + Tensile/AsmCaps.py | 44 +++++++++++++++++++ + Tensile/Common.py | 15 ++++--- + Tensile/Source/lib/include/Tensile/AMDGPU.hpp | 7 +++ + .../include/Tensile/PlaceholderLibrary.hpp | 3 ++ + 4 files changed, 62 insertions(+), 7 deletions(-) + +diff --git a/Tensile/AsmCaps.py b/Tensile/AsmCaps.py +index 78ffa73bd81a..ea0518752ac4 100644 +--- a/Tensile/AsmCaps.py ++++ b/Tensile/AsmCaps.py +@@ -419,6 +419,50 @@ CACHED_ASM_CAPS = \ + 'v_mov_b64': True, + 'v_pk_fma_f16': True, + 'v_pk_fmac_f16': False}, ++ (9, 5, 0): {'HasAddLshl': True, ++ 'HasAtomicAdd': True, ++ 'HasDirectToLdsDest': False, ++ 
'HasDirectToLdsNoDest': True, ++ 'HasExplicitCO': True, ++ 'HasExplicitNC': False, ++ 'HasGLCModifier': False, ++ 'HasNTModifier': True, ++ 'HasLshlOr': True, ++ 'HasMFMA': True, ++ 'HasMFMA_b8': True, ++ 'HasMFMA_bf16_1k': True, ++ 'HasMFMA_bf16_original': False, ++ 'HasMFMA_constSrc': True, ++ 'HasMFMA_f64': True, ++ 'HasMFMA_f8': True, ++ 'HasMFMA_i8_908': False, ++ 'HasMFMA_i8_940': True, ++ 'HasMFMA_vgpr': True, ++ 'HasMFMA_xf32': False, ++ 'HasSMulHi': True, ++ 'HasWMMA': False, ++ 'KernargPreloading': True, ++ 'MaxLgkmcnt': 15, ++ 'MaxVmcnt': 63, ++ 'SupportedISA': True, ++ 'SupportedSource': True, ++ 'VOP3v_dot4_i32_i8': True, ++ 'v_dot2_f32_f16': True, ++ 'v_dot2c_f32_f16': True, ++ 'v_dot4_i32_i8': False, ++ 'v_dot4c_i32_i8': True, ++ 'v_fma_f16': True, ++ 'v_fma_f32': True, ++ 'v_fma_f64': True, ++ 'v_fma_mix_f32': True, ++ 'v_fmac_f16': False, ++ 'v_fmac_f32': True, ++ 'v_mac_f16': True, ++ 'v_mac_f32': False, ++ 'v_mad_mix_f32': False, ++ 'v_mov_b64': True, ++ 'v_pk_fma_f16': True, ++ 'v_pk_fmac_f16': False}, + (10, 1, 0): {'HasAddLshl': True, + 'HasAtomicAdd': False, + 'HasDirectToLdsDest': False, +diff --git a/Tensile/Common.py b/Tensile/Common.py +index 4d212d977c3d..107dcb272c61 100644 +--- a/Tensile/Common.py ++++ b/Tensile/Common.py +@@ -244,7 +244,7 @@ globalParameters["NumMergedFiles"] = 1 # The number of files that ker + globalParameters["MaxFileName"] = 64 # If a file name would be longer than this, shorten it with a hash. 
+ globalParameters["SupportedISA"] = [(8,0,3), + (9,0,0), (9,0,6), (9,0,8), (9,0,10), +- (9,4,0), (9,4,1), (9,4,2), ++ (9,4,0), (9,4,1), (9,4,2), (9,5,0), + (10,1,0), (10,1,1), (10,1,2), (10,3,0), (10,3,1), (10,3,5), + (11,0,0), (11,0,1), (11,0,2), (11,0,3), + (11,5,0), (11,5,1), (11,5,2), +@@ -321,6 +321,7 @@ architectureMap = { + 'gfx940':'aquavanjaram', 'gfx940:xnack+':'aquavanjaram', 'gfx940:xnack-':'aquavanjaram', + 'gfx941':'aquavanjaram941', 'gfx941:xnack+':'aquavanjaram941', 'gfx941:xnack-':'aquavanjaram941', + 'gfx942':'aquavanjaram942', 'gfx942:xnack+':'aquavanjaram942', 'gfx942:xnack-':'aquavanjaram942', ++ 'gfx950':'gfx950', 'gfx950:xnack+':'gfx950', 'gfx950:xnack-':'gfx950', + 'gfx1010':'navi10', 'gfx1011':'navi12', 'gfx1012':'navi14', + 'gfx1030':'navi21', 'gfx1031':'navi22', 'gfx1032':'navi23', 'gfx1034':'navi24', 'gfx1035':'rembrandt', + 'gfx1100':'navi31', 'gfx1101':'navi32', 'gfx1102':'navi33', +@@ -2157,17 +2158,17 @@ def GetAsmCaps(isaVersion: IsaVersion, compilerVersion: CompilerVersion) -> Dict + def GetArchCaps(isaVersion): + rv = {} + rv["HasEccHalf"] = (isaVersion==(9,0,6) or isaVersion==(9,0,8) or isaVersion==(9,0,10) or \ +- isaVersion==(9,4,0) or isaVersion==(9,4,1) or isaVersion==(9,4,2)) ++ isaVersion==(9,4,0) or isaVersion==(9,4,1) or isaVersion==(9,4,2) or isaVersion==(9,5,0)) + rv["Waitcnt0Disabled"] = (isaVersion==(9,0,8) or isaVersion==(9,0,10) or \ +- isaVersion==(9,4,0) or isaVersion==(9,4,1) or isaVersion==(9,4,2)) ++ isaVersion==(9,4,0) or isaVersion==(9,4,1) or isaVersion==(9,4,2) or isaVersion==(9,5,0)) + rv["SeparateVscnt"] = isaVersion[0] in (10, 11) + rv["CMPXWritesSGPR"] = isaVersion[0] not in (10, 11, 12) + rv["HasWave32"] = isaVersion[0] in (10, 11, 12) +- rv["HasAccCD"] = (isaVersion==(9,0,10) or isaVersion==(9,4,0) or isaVersion==(9,4,1) or isaVersion==(9,4,2)) +- rv["ArchAccUnifiedRegs"] = (isaVersion==(9,0,10) or isaVersion==(9,4,0) or isaVersion==(9,4,1) or isaVersion==(9,4,2)) ++ rv["HasAccCD"] = 
(isaVersion==(9,0,10) or isaVersion==(9,4,0) or isaVersion==(9,4,1) or isaVersion==(9,4,2) or isaVersion==(9,5,0)) ++ rv["ArchAccUnifiedRegs"] = (isaVersion==(9,0,10) or isaVersion==(9,4,0) or isaVersion==(9,4,1) or isaVersion==(9,4,2) or isaVersion==(9,5,0)) + rv["VgprBank"] = isaVersion[0] in (10, 11, 12) + rv["InstRename"] = isaVersion[0]>=11 +- rv["CrosslaneWait"] = (isaVersion==(9,4,0) or isaVersion==(9,4,1) or isaVersion==(9,4,2)) ++ rv["CrosslaneWait"] = (isaVersion==(9,4,0) or isaVersion==(9,4,1) or isaVersion==(9,4,2) or isaVersion==(9,5,0)) + rv["ForceStoreSC1"] = (isaVersion==(9,4,0) or isaVersion==(9,4,1)) + + return rv +@@ -2466,7 +2467,7 @@ def assignGlobalParameters( config, capabilitiesCache: Optional[dict] = None ): + if os.name == "nt": + globalParameters["CurrentISA"] = (9,0,6) + printWarning("Failed to detect ISA so forcing (gfx906) on windows") +- isasWithDisabledHWMonitor = ((9,4,1), (9,4,2), (11,0,0), (11,0,1), (11,0,2), (11,0,3), (11,5,0), (11,5,1), (11,5,2), (12,0,0), (12,0,1)) ++ isasWithDisabledHWMonitor = ((9,4,1), (9,4,2), (9,5,0), (11,0,0), (11,0,1), (11,0,2), (11,0,3), (11,5,0), (11,5,1), (11,5,2), (12,0,0), (12,0,1)) + if globalParameters["CurrentISA"] in isasWithDisabledHWMonitor: + isaString = ', '.join(map(gfxName, isasWithDisabledHWMonitor)) + printWarning(f"HardwareMonitor currently disabled for {isaString}") +diff --git a/Tensile/Source/lib/include/Tensile/AMDGPU.hpp b/Tensile/Source/lib/include/Tensile/AMDGPU.hpp +index 2317ce79f8f2..e65a4831e082 100644 +--- a/Tensile/Source/lib/include/Tensile/AMDGPU.hpp ++++ b/Tensile/Source/lib/include/Tensile/AMDGPU.hpp +@@ -63,6 +63,7 @@ namespace Tensile + gfx940 = 940, + gfx941 = 941, + gfx942 = 942, ++ gfx950 = 950, + gfx1010 = 1010, + gfx1011 = 1011, + gfx1012 = 1012, +@@ -100,6 +101,8 @@ namespace Tensile + return "gfx941"; + case AMDGPU::Processor::gfx942: + return "gfx942"; ++ case AMDGPU::Processor::gfx950: ++ return "gfx950"; + case AMDGPU::Processor::gfx1010: + return "gfx1010"; 
+ case AMDGPU::Processor::gfx1011: +@@ -168,6 +171,10 @@ namespace Tensile + { + return AMDGPU::Processor::gfx942; + } ++ else if(deviceString.find("gfx950") != std::string::npos) ++ { ++ return AMDGPU::Processor::gfx950; ++ } + else if(deviceString.find("gfx1010") != std::string::npos) + { + return AMDGPU::Processor::gfx1010; +diff --git a/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp b/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp +index f838f15d3ac4..ba9719f77bb2 100644 +--- a/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp ++++ b/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp +@@ -47,6 +47,7 @@ namespace Tensile + gfx940, + gfx941, + gfx942, ++ gfx950, + gfx1010, + gfx1011, + gfx1012, +@@ -88,6 +89,8 @@ namespace Tensile + return "TensileLibrary_*_gfx941"; + case LazyLoadingInit::gfx942: + return "TensileLibrary_*_gfx942"; ++ case LazyLoadingInit::gfx950: ++ return "TensileLibrary_*_gfx950"; + case LazyLoadingInit::gfx1010: + return "TensileLibrary_*_gfx1010"; + case LazyLoadingInit::gfx1011: +-- +2.50.1 + diff --git a/python-tensile.spec b/python-tensile.spec index f1e1617..8e14012 100644 --- a/python-tensile.spec +++ b/python-tensile.spec @@ -20,7 +20,7 @@ Name: python-tensile-devel Name: python-tensile %endif Version: %{rocm_version} -Release: 4%{?dist} +Release: 5%{?dist} Summary: Tool for creating benchmark-driven backend libraries for GEMMs URL: https://github.com/ROCmSoftwarePlatform/Tensile @@ -28,6 +28,8 @@ License: MIT Source0: %{url}/archive/rocm-%{version}.tar.gz#/%{upstreamname}-%{version}.tar.gz Patch1: 0001-tensile-fedora-gpus.patch +Patch2: 0001-tensile-gfx950.patch +Patch3: 0001-tensile-gfx1153.patch %if 0%{?fedora} || 0%{?suse_version} BuildRequires: fdupes @@ -181,6 +183,10 @@ mv %{buildroot}%{_datadir}/cmake/Tensile/*.cmake %{buildroot}%{python3_sitelib}/ %{python_sitelib}/%{upstreamname}*.egg-info/* %changelog +* Sun Jul 27 2025 Tom Rix - 6.4.0-5 +- patch in gfx950 support from develop branch +- 
patch in gfx1153 support + * Fri Jul 25 2025 Fedora Release Engineering <releng@fedoraproject.org> - 6.4.0-4 - Rebuilt for https://fedoraproject.org/wiki/Fedora_43_Mass_Rebuild