Add support for gfx950 and gfx1153

Signed-off-by: Tom Rix <Tom.Rix@amd.com>
This commit is contained in:
2025-07-28 05:59:42 -07:00
parent ddea4f83dc
commit d688094aa3
3 changed files with 361 additions and 1 deletions

174
0001-tensile-gfx1153.patch Normal file
View File

@@ -0,0 +1,174 @@
From 6d6d64c3065b3485ff764c4e368849ba0d41d0e4 Mon Sep 17 00:00:00 2001
From: Tom Rix <Tom.Rix@amd.com>
Date: Sun, 27 Jul 2025 13:06:25 -0700
Subject: [PATCH] tensile gfx1153
Add gfx1153 support by copying the existing gfx1152 entries.
Signed-off-by: Tom Rix <Tom.Rix@amd.com>
---
Tensile/AsmCaps.py | 44 +++++++++++++++++++
Tensile/Common.py | 6 +--
Tensile/Source/CMakeLists.txt | 4 +-
Tensile/Source/lib/include/Tensile/AMDGPU.hpp | 7 +++
.../include/Tensile/PlaceholderLibrary.hpp | 3 ++
5 files changed, 59 insertions(+), 5 deletions(-)
diff --git a/Tensile/AsmCaps.py b/Tensile/AsmCaps.py
index ea0518752ac4..83514e6d8f03 100644
--- a/Tensile/AsmCaps.py
+++ b/Tensile/AsmCaps.py
@@ -1035,6 +1035,50 @@ CACHED_ASM_CAPS = \
'v_mov_b64': False,
'v_pk_fma_f16': True,
'v_pk_fmac_f16': False},
+ (11, 5, 3): {'HasAddLshl': True,
+ 'HasAtomicAdd': True,
+ 'HasDirectToLdsDest': False,
+ 'HasDirectToLdsNoDest': False,
+ 'HasExplicitCO': True,
+ 'HasExplicitNC': True,
+ 'HasGLCModifier': True,
+ 'HasNTModifier': False,
+ 'HasLshlOr': True,
+ 'HasMFMA': False,
+ 'HasMFMA_b8': False,
+ 'HasMFMA_bf16_1k': False,
+ 'HasMFMA_bf16_original': False,
+ 'HasMFMA_constSrc': False,
+ 'HasMFMA_f64': False,
+ 'HasMFMA_f8': False,
+ 'HasMFMA_i8_908': False,
+ 'HasMFMA_i8_940': False,
+ 'HasMFMA_vgpr': False,
+ 'HasMFMA_xf32': False,
+ 'HasSMulHi': True,
+ 'HasWMMA': True,
+ 'KernargPreloading': False,
+ 'MaxLgkmcnt': 15,
+ 'MaxVmcnt': 63,
+ 'SupportedISA': True,
+ 'SupportedSource': True,
+ 'VOP3v_dot4_i32_i8': False,
+ 'v_dot2_f32_f16': True,
+ 'v_dot2c_f32_f16': True,
+ 'v_dot4_i32_i8': False,
+ 'v_dot4c_i32_i8': False,
+ 'v_fma_f16': True,
+ 'v_fma_f32': True,
+ 'v_fma_f64': True,
+ 'v_fma_mix_f32': True,
+ 'v_fmac_f16': False,
+ 'v_fmac_f32': True,
+ 'v_mac_f16': False,
+ 'v_mac_f32': False,
+ 'v_mad_mix_f32': False,
+ 'v_mov_b64': False,
+ 'v_pk_fma_f16': True,
+ 'v_pk_fmac_f16': False},
(12, 0, 0): {'HasAddLshl': True,
'HasAtomicAdd': False,
'HasDirectToLdsDest': False,
diff --git a/Tensile/Common.py b/Tensile/Common.py
index 107dcb272c61..f66e7a274953 100644
--- a/Tensile/Common.py
+++ b/Tensile/Common.py
@@ -247,7 +247,7 @@ globalParameters["SupportedISA"] = [(8,0,3),
(9,4,0), (9,4,1), (9,4,2), (9,5,0),
(10,1,0), (10,1,1), (10,1,2), (10,3,0), (10,3,1), (10,3,5),
(11,0,0), (11,0,1), (11,0,2), (11,0,3),
- (11,5,0), (11,5,1), (11,5,2),
+ (11,5,0), (11,5,1), (11,5,2), (11,5,3),
(12,0,0), (12,0,1)] # assembly kernels writer supports these architectures
globalParameters["KeepBuildTmp"] = True # Do not remove build artifacts during the build process or build_tmp after build completes
@@ -325,7 +325,7 @@ architectureMap = {
'gfx1010':'navi10', 'gfx1011':'navi12', 'gfx1012':'navi14',
'gfx1030':'navi21', 'gfx1031':'navi22', 'gfx1032':'navi23', 'gfx1034':'navi24', 'gfx1035':'rembrandt',
'gfx1100':'navi31', 'gfx1101':'navi32', 'gfx1102':'navi33',
- 'gfx1103':'gfx1103', 'gfx1150':'gfx1150', 'gfx1151':'gfx1151', 'gfx1152':'gfx1152',
+ 'gfx1103':'gfx1103', 'gfx1150':'gfx1150', 'gfx1151':'gfx1151', 'gfx1152':'gfx1152', 'gfx1153':'gfx1153',
'gfx1200':'gfx1200',
'gfx1201':'gfx1201'
}
@@ -2467,7 +2467,7 @@ def assignGlobalParameters( config, capabilitiesCache: Optional[dict] = None ):
if os.name == "nt":
globalParameters["CurrentISA"] = (9,0,6)
printWarning("Failed to detect ISA so forcing (gfx906) on windows")
- isasWithDisabledHWMonitor = ((9,4,1), (9,4,2), (9,5,0), (11,0,0), (11,0,1), (11,0,2), (11,0,3), (11,5,0), (11,5,1), (11,5,2), (12,0,0), (12,0,1))
+ isasWithDisabledHWMonitor = ((9,4,1), (9,4,2), (9,5,0), (11,0,0), (11,0,1), (11,0,2), (11,0,3), (11,5,0), (11,5,1), (11,5,2), (11,5,3), (12,0,0), (12,0,1))
if globalParameters["CurrentISA"] in isasWithDisabledHWMonitor:
isaString = ', '.join(map(gfxName, isasWithDisabledHWMonitor))
printWarning(f"HardwareMonitor currently disabled for {isaString}")
diff --git a/Tensile/Source/CMakeLists.txt b/Tensile/Source/CMakeLists.txt
index 7f10ee319518..1002b29bb981 100644
--- a/Tensile/Source/CMakeLists.txt
+++ b/Tensile/Source/CMakeLists.txt
@@ -51,9 +51,9 @@ if(CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" OR CMAKE_CXX_COMPILER MATCHES ".*clang
endif()
if(CMAKE_CXX_COMPILER STREQUAL "hipcc")
- set(TENSILE_GPU_ARCHS gfx803 gfx900 gfx906:xnack- gfx908:xnack- gfx90a:xnack- gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 CACHE STRING "GPU architectures")
+ set(TENSILE_GPU_ARCHS gfx803 gfx900 gfx906:xnack- gfx908:xnack- gfx90a:xnack- gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1153 CACHE STRING "GPU architectures")
else()
- set(TENSILE_GPU_ARCHS gfx803 gfx900 gfx906 gfx908 gfx90a gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 CACHE STRING "GPU architectures")
+ set(TENSILE_GPU_ARCHS gfx803 gfx900 gfx906 gfx908 gfx90a gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1034 gfx1035 gfx1100 gfx1101 gfx1102 gfx1103 gfx1150 gfx1151 gfx1152 gfx1153 CACHE STRING "GPU architectures")
endif()
include(CMakeDependentOption)
diff --git a/Tensile/Source/lib/include/Tensile/AMDGPU.hpp b/Tensile/Source/lib/include/Tensile/AMDGPU.hpp
index e65a4831e082..5e0a43c0fdab 100644
--- a/Tensile/Source/lib/include/Tensile/AMDGPU.hpp
+++ b/Tensile/Source/lib/include/Tensile/AMDGPU.hpp
@@ -79,6 +79,7 @@ namespace Tensile
gfx1150 = 1150,
gfx1151 = 1151,
gfx1152 = 1152,
+ gfx1153 = 1153,
};
static std::string toString(Processor p)
@@ -133,6 +134,8 @@ namespace Tensile
return "gfx1151";
case AMDGPU::Processor::gfx1152:
return "gfx1152";
+ case AMDGPU::Processor::gfx1153:
+ return "gfx1153";
}
return "";
}
@@ -219,6 +222,10 @@ namespace Tensile
{
return AMDGPU::Processor::gfx1152;
}
+ else if(deviceString.find("gfx1153") != std::string::npos)
+ {
+ return AMDGPU::Processor::gfx1153;
+ }
else
{
return static_cast<AMDGPU::Processor>(0);
diff --git a/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp b/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp
index ba9719f77bb2..3d1d43e0c9d0 100644
--- a/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp
+++ b/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp
@@ -63,6 +63,7 @@ namespace Tensile
gfx1151,
gfx1150,
gfx1152,
+ gfx1153,
All
};
@@ -121,6 +122,8 @@ namespace Tensile
return "TensileLibrary_*_gfx1151";
case LazyLoadingInit::gfx1152:
return "TensileLibrary_*_gfx1152";
+ case LazyLoadingInit::gfx1153:
+ return "TensileLibrary_*_gfx1153";
case LazyLoadingInit::None:
return "";
}
--
2.50.1

180
0001-tensile-gfx950.patch Normal file
View File

@@ -0,0 +1,180 @@
From e6851f038000be90cd29f3d530834e35111351c3 Mon Sep 17 00:00:00 2001
From: Tom Rix <Tom.Rix@amd.com>
Date: Sun, 27 Jul 2025 12:20:36 -0700
Subject: [PATCH] tensile gfx950
Copy gfx950 support from the develop branch at commit
01ab9e776518ff8fda3a0086a3f3f9d17cd95f59
Signed-off-by: Tom Rix <Tom.Rix@amd.com>
---
Tensile/AsmCaps.py | 44 +++++++++++++++++++
Tensile/Common.py | 15 ++++---
Tensile/Source/lib/include/Tensile/AMDGPU.hpp | 7 +++
.../include/Tensile/PlaceholderLibrary.hpp | 3 ++
4 files changed, 62 insertions(+), 7 deletions(-)
diff --git a/Tensile/AsmCaps.py b/Tensile/AsmCaps.py
index 78ffa73bd81a..ea0518752ac4 100644
--- a/Tensile/AsmCaps.py
+++ b/Tensile/AsmCaps.py
@@ -419,6 +419,50 @@ CACHED_ASM_CAPS = \
'v_mov_b64': True,
'v_pk_fma_f16': True,
'v_pk_fmac_f16': False},
+ (9, 5, 0): {'HasAddLshl': True,
+ 'HasAtomicAdd': True,
+ 'HasDirectToLdsDest': False,
+ 'HasDirectToLdsNoDest': True,
+ 'HasExplicitCO': True,
+ 'HasExplicitNC': False,
+ 'HasGLCModifier': False,
+ 'HasNTModifier': True,
+ 'HasLshlOr': True,
+ 'HasMFMA': True,
+ 'HasMFMA_b8': True,
+ 'HasMFMA_bf16_1k': True,
+ 'HasMFMA_bf16_original': False,
+ 'HasMFMA_constSrc': True,
+ 'HasMFMA_f64': True,
+ 'HasMFMA_f8': True,
+ 'HasMFMA_i8_908': False,
+ 'HasMFMA_i8_940': True,
+ 'HasMFMA_vgpr': True,
+ 'HasMFMA_xf32': False,
+ 'HasSMulHi': True,
+ 'HasWMMA': False,
+ 'KernargPreloading': True,
+ 'MaxLgkmcnt': 15,
+ 'MaxVmcnt': 63,
+ 'SupportedISA': True,
+ 'SupportedSource': True,
+ 'VOP3v_dot4_i32_i8': True,
+ 'v_dot2_f32_f16': True,
+ 'v_dot2c_f32_f16': True,
+ 'v_dot4_i32_i8': False,
+ 'v_dot4c_i32_i8': True,
+ 'v_fma_f16': True,
+ 'v_fma_f32': True,
+ 'v_fma_f64': True,
+ 'v_fma_mix_f32': True,
+ 'v_fmac_f16': False,
+ 'v_fmac_f32': True,
+ 'v_mac_f16': True,
+ 'v_mac_f32': False,
+ 'v_mad_mix_f32': False,
+ 'v_mov_b64': True,
+ 'v_pk_fma_f16': True,
+ 'v_pk_fmac_f16': False},
(10, 1, 0): {'HasAddLshl': True,
'HasAtomicAdd': False,
'HasDirectToLdsDest': False,
diff --git a/Tensile/Common.py b/Tensile/Common.py
index 4d212d977c3d..107dcb272c61 100644
--- a/Tensile/Common.py
+++ b/Tensile/Common.py
@@ -244,7 +244,7 @@ globalParameters["NumMergedFiles"] = 1 # The number of files that ker
globalParameters["MaxFileName"] = 64 # If a file name would be longer than this, shorten it with a hash.
globalParameters["SupportedISA"] = [(8,0,3),
(9,0,0), (9,0,6), (9,0,8), (9,0,10),
- (9,4,0), (9,4,1), (9,4,2),
+ (9,4,0), (9,4,1), (9,4,2), (9,5,0),
(10,1,0), (10,1,1), (10,1,2), (10,3,0), (10,3,1), (10,3,5),
(11,0,0), (11,0,1), (11,0,2), (11,0,3),
(11,5,0), (11,5,1), (11,5,2),
@@ -321,6 +321,7 @@ architectureMap = {
'gfx940':'aquavanjaram', 'gfx940:xnack+':'aquavanjaram', 'gfx940:xnack-':'aquavanjaram',
'gfx941':'aquavanjaram941', 'gfx941:xnack+':'aquavanjaram941', 'gfx941:xnack-':'aquavanjaram941',
'gfx942':'aquavanjaram942', 'gfx942:xnack+':'aquavanjaram942', 'gfx942:xnack-':'aquavanjaram942',
+ 'gfx950':'gfx950', 'gfx950:xnack+':'gfx950', 'gfx950:xnack-':'gfx950',
'gfx1010':'navi10', 'gfx1011':'navi12', 'gfx1012':'navi14',
'gfx1030':'navi21', 'gfx1031':'navi22', 'gfx1032':'navi23', 'gfx1034':'navi24', 'gfx1035':'rembrandt',
'gfx1100':'navi31', 'gfx1101':'navi32', 'gfx1102':'navi33',
@@ -2157,17 +2158,17 @@ def GetAsmCaps(isaVersion: IsaVersion, compilerVersion: CompilerVersion) -> Dict
def GetArchCaps(isaVersion):
rv = {}
rv["HasEccHalf"] = (isaVersion==(9,0,6) or isaVersion==(9,0,8) or isaVersion==(9,0,10) or \
- isaVersion==(9,4,0) or isaVersion==(9,4,1) or isaVersion==(9,4,2))
+ isaVersion==(9,4,0) or isaVersion==(9,4,1) or isaVersion==(9,4,2) or isaVersion==(9,5,0))
rv["Waitcnt0Disabled"] = (isaVersion==(9,0,8) or isaVersion==(9,0,10) or \
- isaVersion==(9,4,0) or isaVersion==(9,4,1) or isaVersion==(9,4,2))
+ isaVersion==(9,4,0) or isaVersion==(9,4,1) or isaVersion==(9,4,2) or isaVersion==(9,5,0))
rv["SeparateVscnt"] = isaVersion[0] in (10, 11)
rv["CMPXWritesSGPR"] = isaVersion[0] not in (10, 11, 12)
rv["HasWave32"] = isaVersion[0] in (10, 11, 12)
- rv["HasAccCD"] = (isaVersion==(9,0,10) or isaVersion==(9,4,0) or isaVersion==(9,4,1) or isaVersion==(9,4,2))
- rv["ArchAccUnifiedRegs"] = (isaVersion==(9,0,10) or isaVersion==(9,4,0) or isaVersion==(9,4,1) or isaVersion==(9,4,2))
+ rv["HasAccCD"] = (isaVersion==(9,0,10) or isaVersion==(9,4,0) or isaVersion==(9,4,1) or isaVersion==(9,4,2) or isaVersion==(9,5,0))
+ rv["ArchAccUnifiedRegs"] = (isaVersion==(9,0,10) or isaVersion==(9,4,0) or isaVersion==(9,4,1) or isaVersion==(9,4,2) or isaVersion==(9,5,0))
rv["VgprBank"] = isaVersion[0] in (10, 11, 12)
rv["InstRename"] = isaVersion[0]>=11
- rv["CrosslaneWait"] = (isaVersion==(9,4,0) or isaVersion==(9,4,1) or isaVersion==(9,4,2))
+ rv["CrosslaneWait"] = (isaVersion==(9,4,0) or isaVersion==(9,4,1) or isaVersion==(9,4,2) or isaVersion==(9,5,0))
rv["ForceStoreSC1"] = (isaVersion==(9,4,0) or isaVersion==(9,4,1))
return rv
@@ -2466,7 +2467,7 @@ def assignGlobalParameters( config, capabilitiesCache: Optional[dict] = None ):
if os.name == "nt":
globalParameters["CurrentISA"] = (9,0,6)
printWarning("Failed to detect ISA so forcing (gfx906) on windows")
- isasWithDisabledHWMonitor = ((9,4,1), (9,4,2), (11,0,0), (11,0,1), (11,0,2), (11,0,3), (11,5,0), (11,5,1), (11,5,2), (12,0,0), (12,0,1))
+ isasWithDisabledHWMonitor = ((9,4,1), (9,4,2), (9,5,0), (11,0,0), (11,0,1), (11,0,2), (11,0,3), (11,5,0), (11,5,1), (11,5,2), (12,0,0), (12,0,1))
if globalParameters["CurrentISA"] in isasWithDisabledHWMonitor:
isaString = ', '.join(map(gfxName, isasWithDisabledHWMonitor))
printWarning(f"HardwareMonitor currently disabled for {isaString}")
diff --git a/Tensile/Source/lib/include/Tensile/AMDGPU.hpp b/Tensile/Source/lib/include/Tensile/AMDGPU.hpp
index 2317ce79f8f2..e65a4831e082 100644
--- a/Tensile/Source/lib/include/Tensile/AMDGPU.hpp
+++ b/Tensile/Source/lib/include/Tensile/AMDGPU.hpp
@@ -63,6 +63,7 @@ namespace Tensile
gfx940 = 940,
gfx941 = 941,
gfx942 = 942,
+ gfx950 = 950,
gfx1010 = 1010,
gfx1011 = 1011,
gfx1012 = 1012,
@@ -100,6 +101,8 @@ namespace Tensile
return "gfx941";
case AMDGPU::Processor::gfx942:
return "gfx942";
+ case AMDGPU::Processor::gfx950:
+ return "gfx950";
case AMDGPU::Processor::gfx1010:
return "gfx1010";
case AMDGPU::Processor::gfx1011:
@@ -168,6 +171,10 @@ namespace Tensile
{
return AMDGPU::Processor::gfx942;
}
+ else if(deviceString.find("gfx950") != std::string::npos)
+ {
+ return AMDGPU::Processor::gfx950;
+ }
else if(deviceString.find("gfx1010") != std::string::npos)
{
return AMDGPU::Processor::gfx1010;
diff --git a/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp b/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp
index f838f15d3ac4..ba9719f77bb2 100644
--- a/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp
+++ b/Tensile/Source/lib/include/Tensile/PlaceholderLibrary.hpp
@@ -47,6 +47,7 @@ namespace Tensile
gfx940,
gfx941,
gfx942,
+ gfx950,
gfx1010,
gfx1011,
gfx1012,
@@ -88,6 +89,8 @@ namespace Tensile
return "TensileLibrary_*_gfx941";
case LazyLoadingInit::gfx942:
return "TensileLibrary_*_gfx942";
+ case LazyLoadingInit::gfx950:
+ return "TensileLibrary_*_gfx950";
case LazyLoadingInit::gfx1010:
return "TensileLibrary_*_gfx1010";
case LazyLoadingInit::gfx1011:
--
2.50.1

View File

@@ -20,7 +20,7 @@ Name: python-tensile-devel
Name: python-tensile Name: python-tensile
%endif %endif
Version: %{rocm_version} Version: %{rocm_version}
Release: 4%{?dist} Release: 5%{?dist}
Summary: Tool for creating benchmark-driven backend libraries for GEMMs Summary: Tool for creating benchmark-driven backend libraries for GEMMs
URL: https://github.com/ROCmSoftwarePlatform/Tensile URL: https://github.com/ROCmSoftwarePlatform/Tensile
@@ -28,6 +28,8 @@ License: MIT
Source0: %{url}/archive/rocm-%{version}.tar.gz#/%{upstreamname}-%{version}.tar.gz Source0: %{url}/archive/rocm-%{version}.tar.gz#/%{upstreamname}-%{version}.tar.gz
Patch1: 0001-tensile-fedora-gpus.patch Patch1: 0001-tensile-fedora-gpus.patch
Patch2: 0001-tensile-gfx950.patch
Patch3: 0001-tensile-gfx1153.patch
%if 0%{?fedora} || 0%{?suse_version} %if 0%{?fedora} || 0%{?suse_version}
BuildRequires: fdupes BuildRequires: fdupes
@@ -181,6 +183,10 @@ mv %{buildroot}%{_datadir}/cmake/Tensile/*.cmake %{buildroot}%{python3_sitelib}/
%{python_sitelib}/%{upstreamname}*.egg-info/* %{python_sitelib}/%{upstreamname}*.egg-info/*
%changelog %changelog
* Sun Jul 27 2025 Tom Rix <Tom.Rix@amd.com> - 6.4.0-5
- patch in gfx950 support from develop branch
- patch in gfx1153 support
* Fri Jul 25 2025 Fedora Release Engineering <releng@fedoraproject.org> - 6.4.0-4 * Fri Jul 25 2025 Fedora Release Engineering <releng@fedoraproject.org> - 6.4.0-4
- Rebuilt for https://fedoraproject.org/wiki/Fedora_43_Mass_Rebuild - Rebuilt for https://fedoraproject.org/wiki/Fedora_43_Mass_Rebuild